Merge remote-tracking branch 'mirror/v0.8.2' into v0.8.2-ori

31f6b24f · zhuwenwen · 89d1dd57 · 25f560a6 · 31f6b24f · 31f6b24f
Commit 31f6b24f authored Mar 26, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/run-tpu-v1-test.sh
+++ b/.buildkite/run-tpu-v1-test.sh
@@ -22,7 +22,7 @@ docker run --privileged --net host --shm-size=16G -it \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo TEST_1 \
-    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && pytest /workspace/vllm/tests/tpu/test_compilation.py \
    && echo TEST_2 \
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
    && echo TEST_3 \

--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,8 @@
 /vllm/_version.py

 # vllm-flash-attn built from source
-vllm/vllm_flash_attn/
+vllm/vllm_flash_attn/*
+!vllm/vllm_flash_attn/fa_utils.py

 # Byte-compiled / optimized / DLL files
 __pycache__/

--- a/Dockerfile
+++ b/Dockerfile
@@ -14,17 +14,22 @@ ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive

-# Install minimal dependencies and uv
-RUN apt-get update -y \
-    && apt-get install -y ccache git curl wget sudo \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-# Add uv to PATH
-ENV PATH="/root/.local/bin:$PATH"
-# Create venv with specified Python and activate by placing at the front of path
-ENV VIRTUAL_ENV="/opt/venv"
-RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+# Install Python and other dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common git curl sudo \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
+# Install uv for faster pip installs
+RUN --mount=type=cache,target=/root/.cache/uv \
+    python3 -m pip install uv

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -46,20 +51,22 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

 WORKDIR /workspace

+# install build and runtime dependencies
+
 # arm64 (GH200) build follows the practice of "use existing pytorch" build,
 # we need to install torch and torchvision from the nightly builds first,
 # pytorch will not appear as a vLLM dependency in all of the following steps
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi

 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/cuda.txt
+    uv pip install --system -r requirements/cuda.txt

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@@ -84,7 +91,7 @@ COPY requirements/build.txt requirements/build.txt
 ENV UV_HTTP_TIMEOUT=500

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/build.txt
+    uv pip install --system -r requirements/build.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -156,7 +163,7 @@ COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/dev.txt
+    uv pip install --system -r requirements/dev.txt
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
@@ -172,18 +179,23 @@ ARG TARGETPLATFORM
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

-# Install minimal dependencies and uv
-RUN apt-get update -y \
-    && apt-get install -y ccache git curl wget sudo vim \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-# Add uv to PATH
-ENV PATH="/root/.local/bin:$PATH"
-# Create venv with specified Python and activate by placing at the front of path
-ENV VIRTUAL_ENV="/opt/venv"
-RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+# Install Python and other dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
+# Install uv for faster pip installs
+RUN --mount=type=cache,target=/root/.cache/uv \
+    python3 -m pip install uv

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
@@ -201,14 +213,14 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
+        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
    fi

 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install dist/*.whl --verbose
+    uv pip install --system dist/*.whl --verbose

 # If we need to build FlashInfer wheel before its release:
 # $ export FLASHINFER_ENABLE_AOT=1
@@ -223,8 +235,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl

 RUN --mount=type=cache,target=/root/.cache/uv \
+. /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
+    uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
 fi
 COPY examples examples

@@ -234,7 +247,7 @@ COPY examples examples
 # TODO: Remove this once FlashInfer AOT wheel is fixed
 COPY requirements/build.txt requirements/build.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/build.txt
+    uv pip install --system -r requirements/build.txt

 #################### vLLM installation IMAGE ####################

@@ -251,15 +264,15 @@ ENV UV_HTTP_TIMEOUT=500

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/dev.txt
+    uv pip install --system -r requirements/dev.txt

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -e tests/vllm_test_utils
+    uv pip install --system -e tests/vllm_test_utils

 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install hf_transfer
+    uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

 # Copy in the v1 package for testing (it isn't distributed yet)
@@ -284,9 +297,9 @@ ENV UV_HTTP_TIMEOUT=500
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
-        uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi

 ENV VLLM_USAGE_SOURCE production-docker-image

--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -40,7 +40,7 @@ ARG USE_CYTHON
 RUN cd vllm \
    && python3 -m pip install -r requirements/rocm.txt \
    && python3 setup.py clean --all  \
-    && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
+    && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
    && python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR

--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -42,7 +42,7 @@ namespace marlin {
 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
-                                    int size_k, int block_rows) {}
+                                    int size_k, int lda, int block_rows) {}

 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
@@ -459,7 +459,7 @@ __device__ inline void barrier_release(int* lock, bool reset = false) {
 __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
                                    int const* __restrict__ perm_int_ptr,
                                    int4* __restrict__ out_int4_ptr, int size_m,
-                                    int size_k, int block_rows) {
+                                    int size_k, int lda, int block_rows) {
  int start_row = block_rows * blockIdx.x;
  int finish_row = start_row + block_rows;
  if (finish_row > size_m) {
@@ -467,16 +467,19 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
  }
  int cur_block_rows = finish_row - start_row;

-  int row_stride = size_k * sizeof(half) / 16;
+  int input_row_stride = lda * sizeof(half) / 16;
+  int output_row_stride = size_k * sizeof(half) / 16;

  auto permute_row = [&](int row) {
    int iters = size_k / default_threads;
    int rest = size_k % default_threads;

-    int offset = row * row_stride;
+    int input_offset = row * input_row_stride;
+    int output_offset = row * output_row_stride;

-    half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + offset);
-    half* out_half = reinterpret_cast<half*>(out_int4_ptr + offset);
+    half const* a_row_half =
+        reinterpret_cast<half const*>(a_int4_ptr + input_offset);
+    half* out_half = reinterpret_cast<half*>(out_int4_ptr + output_offset);

    int base_k = 0;

@@ -537,6 +540,7 @@ __global__ void Marlin(
    int prob_m,           // batch dimension m
    int prob_n,           // output dimension n
    int prob_k,           // reduction dimension k
+    int lda,              // A.stride(0), equal to prob_k if A is contiguous
    int* locks,           // extra global storage for barrier synchronization
    bool use_atomic_add,  // whether to use atomic add to reduce
    bool use_fp32_reduce  // whether to use fp32 global reduce
@@ -600,7 +604,7 @@ __global__ void Marlin(
  // We can easily implement parallel problem execution by just remapping
  // indices and advancing global pointers
  if (slice_col_par >= n_tiles) {
-    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * lda / 8;
    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
    locks += (slice_col_par / n_tiles) * n_tiles;
    slice_col = slice_col_par % n_tiles;
@@ -631,7 +635,7 @@ __global__ void Marlin(
      }
    }
    if (slice_col == n_tiles) {
-      A += 16 * thread_m_blocks * prob_k / 8;
+      A += 16 * thread_m_blocks * lda / 8;
      C += 16 * thread_m_blocks * prob_n / 8;
      locks += n_tiles;
      slice_col = 0;
@@ -643,7 +647,7 @@ __global__ void Marlin(
  // A sizes/strides

  // stride of the A matrix in global memory
-  int a_gl_stride = prob_k / 8;
+  int a_gl_stride = lda / 8;
  // stride of an A matrix tile in shared memory
  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
  // delta between subsequent A tiles in global memory
@@ -1780,8 +1784,8 @@ __global__ void Marlin(
               HAS_ZP, GROUP_BLOCKS, IS_ZP_FLOAT>                              \
            <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                 \
                A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,      \
-                num_groups, prob_m, prob_n, prob_k, locks, use_atomic_add,     \
-                use_fp32_reduce);                                              \
+                num_groups, prob_m, prob_n, prob_k, lda, locks,                \
+                use_atomic_add, use_fp32_reduce);                              \
      }                                                                        \
    }

@@ -2071,7 +2075,7 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
 template <typename scalar_t>
 void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
               void* zp, void* g_idx, void* perm, void* a_tmp, int prob_m,
-               int prob_n, int prob_k, void* workspace,
+               int prob_n, int prob_k, int lda, void* workspace,
               vllm::ScalarType const& q_type, bool has_act_order,
               bool is_k_full, bool has_zp, int num_groups, int group_size,
               int dev, cudaStream_t stream, int thread_k, int thread_n,
@@ -2184,8 +2188,9 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
    // Permute A columns
    int block_rows = div_ceil(prob_m, blocks);
    permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
-        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
+        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows);
    A_ptr = a_tmp_ptr;
+    lda = prob_k;
  }

  // If we have a full K, then we can run the non-act-order version of Marlin
@@ -2244,7 +2249,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
                  ", num_bits = ", num_bits);
    }

-    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
+    A_ptr += 16 * thread_m_blocks * (lda / 8) * par;
    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
  }
 }
@@ -2300,7 +2305,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,

  // Verify device and strides
  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
-  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+  TORCH_CHECK(a.stride(1) == 1, "A.stride(1) is not 1");
+  // We use int4 (16 bytes) to load A, so A must be aligned to 16 bytes
+  TORCH_CHECK(a.stride(0) % 8 == 0, "A.stride(0) must divisible by 8");
+  TORCH_CHECK(((uint64_t)a.data_ptr()) % 16 == 0, "A must aligned to 16 bytes");

  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
@@ -2432,7 +2440,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
        c_tmp.data_ptr<float>(), b_scales.data_ptr<at::Half>(),
        b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
-        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
+        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, a.stride(0),
        workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
        num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
        thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
@@ -2443,10 +2451,10 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
        b_scales.data_ptr<at::BFloat16>(), b_zeros.data_ptr(), g_idx.data_ptr(),
        perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(), size_m, size_n, size_k,
-        workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
-        num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
-        thread_k, thread_n, sms, marlin::max_par, use_atomic_add,
-        use_fp32_reduce, is_zp_float);
+        a.stride(0), workspace.data_ptr(), b_q_type, has_act_order, is_k_full,
+        has_zp, num_groups, group_size, dev,
+        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+        marlin::max_par, use_atomic_add, use_fp32_reduce, is_zp_float);
  } else {
    TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
  }

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -85,6 +85,7 @@ html_static_path = ["_static"]
 html_js_files = ["custom.js"]
 html_css_files = ["custom.css"]

+myst_heading_anchors = 2
 myst_url_schemes = {
    'http': None,
    'https': None,

--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@@ -4,6 +4,9 @@

 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.

+* [Deployment with CPUs](#deployment-with-cpus)
+* [Deployment with GPUs](#deployment-with-gpus)
+
 Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 * [Helm](frameworks/helm.md)
 * [InftyAI/llmaz](integrations/llmaz.md)
@@ -14,11 +17,107 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 * [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
 * [vllm-project/production-stack](integrations/production-stack.md)

-## Pre-requisite
+## Deployment with CPUs
+
+:::{note}
+The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
+:::
+
+First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
+
+```bash
+cat <<EOF |kubectl apply -f -
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 50Gi
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: $(HF_TOKEN)
+EOF
+```
+
+Next, start the vLLM server as a Kubernetes Deployment and Service:
+
+```bash
+cat <<EOF |kubectl apply -f -
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm
+    spec:
+      containers:
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve meta-llama/Llama-3.2-1B-Instruct"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+          - containerPort: 8000
+        volumeMounts:
+          - name: llama-storage
+            mountPath: /root/.cache/huggingface
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server
+spec:
+  selector:
+    app.kubernetes.io/name: vllm
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
+EOF
+```
+
+We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
+
+```console
+kubectl logs -l app.kubernetes.io/name=vllm
+...
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+```

-Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).
+## Deployment with GPUs

-## Deployment using native K8s
+**Pre-requisite**: Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/).

 1. Create a PVC, Secret and Deployment for vLLM


--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
 Below are the steps to utilize BitsAndBytes with vLLM.

 ```console
-pip install bitsandbytes>=0.45.0
+pip install bitsandbytes>=0.45.3
 ```

 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.

--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@@ -193,7 +193,7 @@ vLLM CPU backend supports the following vLLM features:

 ## Related runtime environment variables

-- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g., `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache); a larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
 - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
 - `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).


--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/source/getting_started/v1_user_guide.md
@@ -156,6 +156,9 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring

 For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).

-## FAQ
+## Frequently Asked Questions

-TODO
+**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
+The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
+
+On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
--- a/docs/source/models/extensions/fastsafetensor.md
+++ b/docs/source/models/extensions/fastsafetensor.md
+Loading Model weights with fastsafetensors
+===================================================================
+
+Using the fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See https://github.com/foundation-model-stack/fastsafetensors for more details.
+To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``.
--- a/docs/source/models/extensions/index.md
+++ b/docs/source/models/extensions/index.md
@@ -5,4 +5,5 @@

 runai_model_streamer
 tensorizer
+fastsafetensor
 :::
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -73,7 +73,7 @@ The Transformers fallback explicitly supports the following features:

 - <project:#quantization-index> (except GGUF)
 - <project:#lora-adapter>
-- <project:#distributed-serving> (pipeline parallel coming soon <gh-pr:12832>!)
+- <project:#distributed-serving> (requires `transformers>=4.49.0`)

 #### Remote code


--- a/docs/source/serving/engine_args.md
+++ b/docs/source/serving/engine_args.md
@@ -2,7 +2,12 @@

 # Engine Arguments

-Below, you can find an explanation of every engine argument for vLLM:
+Engine arguments control the behavior of the vLLM engine.
+
+- For [offline inference](#offline-inference), they are part of the arguments to `LLM` class.
+- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`.
+
+Below, you can find an explanation of every engine argument:

 <!--- pyml disable-num-lines 7 no-space-in-emphasis -->
 ```{eval-rst}
@@ -15,7 +20,7 @@ Below, you can find an explanation of every engine argument for vLLM:

 ## Async Engine Arguments

-Below are the additional arguments related to the asynchronous engine:
+Additional arguments are available to the asynchronous engine, which is used for online serving:

 <!--- pyml disable-num-lines 7 no-space-in-emphasis -->
 ```{eval-rst}

--- a/docs/source/serving/offline_inference.md
+++ b/docs/source/serving/offline_inference.md
@@ -97,6 +97,13 @@ llm = LLM(model="adept/fuyu-8b",
          max_num_seqs=2)
 ```

+#### Adjust cache size
+
+If you run out of CPU RAM, try the following options:
+
+- (Multi-modal models only) you can set the size of the multi-modal input cache using the `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
+- (CPU backend only) you can set the size of the KV cache using the `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
+
 ### Performance optimization and tuning

 You can potentially improve the performance of vLLM by finetuning various options.

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,6 +86,7 @@ exclude = [
 "vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
 "vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
+"vllm/utils.py" = ["UP006", "UP035"]

 [tool.ruff.lint]
 select = [

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -18,7 +18,7 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.11, < 0.11
-llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
+llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
 xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -41,3 +41,4 @@ tritonclient==2.51.0
 numpy < 2.0.0
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
+fastsafetensors>=0.1.10
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -67,6 +67,7 @@ click==8.1.7
    #   jiwer
    #   nltk
    #   ray
+    #   typer
 colorama==0.4.6
    # via
    #   awscli
@@ -122,6 +123,8 @@ fastparquet==2024.11.0
    # via genai-perf
 fastrlock==0.8.2
    # via cupy-cuda12x
+fastsafetensors==0.1.10
+    # via -r requirements/test.in
 filelock==3.16.1
    # via
    #   datasets
@@ -505,7 +508,9 @@ requests==2.32.3
 responses==0.25.3
    # via genai-perf
 rich==13.9.4
-    # via genai-perf
+    # via
+    #   genai-perf
+    #   typer
 rouge-score==0.1.2
    # via lm-eval
 rpds-py==0.20.1
@@ -550,6 +555,8 @@ setuptools==75.8.0
    # via
    #   pytablewriter
    #   torch
+shellingham==1.5.4
+    # via typer
 six==1.16.0
    # via
    #   python-dateutil
@@ -600,6 +607,7 @@ torch==2.6.0
    #   accelerate
    #   bitsandbytes
    #   encodec
+    #   fastsafetensors
    #   lm-eval
    #   peft
    #   runai-model-streamer
@@ -654,6 +662,8 @@ typepy==1.3.2
    #   dataproperty
    #   pytablewriter
    #   tabledata
+typer==0.15.2
+    # via fastsafetensors
 typing-extensions==4.12.2
    # via
    #   huggingface-hub
@@ -663,6 +673,7 @@ typing-extensions==4.12.2
    #   pydantic
    #   pydantic-core
    #   torch
+    #   typer
 tzdata==2024.2
    # via pandas
 urllib3==2.2.3

--- a/setup.py
+++ b/setup.py
@@ -680,6 +680,7 @@ setup(
    install_requires=get_requirements(),
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
+        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
        "audio": ["librosa", "soundfile"],  # Required for audio processing
        "video": ["decord"]  # Required for video processing