Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys import sys
SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0" SPDX_HEADER = (
"# SPDX-License-Identifier: Apache-2.0\n"
"# SPDX-FileCopyrightText: Copyright contributors to the vLLM project")
SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:" SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:"
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess import subprocess
import sys import sys
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations from __future__ import annotations
import subprocess import subprocess
...@@ -58,6 +59,9 @@ def main() -> int: ...@@ -58,6 +59,9 @@ def main() -> int:
if not Path(filepath).exists(): if not Path(filepath).exists():
continue continue
if filepath == "setup.py":
continue
violations = check_file(filepath) violations = check_file(filepath)
if violations: if violations:
print(f"\n{filepath}:") print(f"\n{filepath}:")
......
Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package. Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
Here we break down the requirements in 3 steps: Here we break down the requirements in 2 steps:
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this. 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that they can be shipped as a single image. 2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
3. Build and install the system drivers (GDR Copy, and necessary modifications to NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine.
2 and 3 are necessary for multi-node deployment. 2 is necessary for multi-node deployment.
All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
...@@ -21,7 +20,6 @@ bash install_python_libraries.sh ...@@ -21,7 +20,6 @@ bash install_python_libraries.sh
```bash ```bash
bash install_python_libraries.sh bash install_python_libraries.sh
sudo bash install_system_libraries.sh sudo bash configure_system_drivers.sh
sudo bash install_system_drivers.sh
sudo reboot # Reboot is required to load the new driver sudo reboot # Reboot is required to load the new driver
``` ```
set -ex set -ex
# prepare workspace directory
WORKSPACE=$1
if [ -z "$WORKSPACE" ]; then
export WORKSPACE=$(pwd)/ep_kernels_workspace
fi
if [ ! -d "$WORKSPACE" ]; then
mkdir -p $WORKSPACE
fi
# build and install gdrcopy driver
pushd $WORKSPACE
cd gdrcopy_src
./insmod.sh
# run gdrcopy_copybw to test the installation
$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw
# turn on IBGDA # turn on IBGDA
echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
update-initramfs -u update-initramfs -u
......
...@@ -13,16 +13,6 @@ fi ...@@ -13,16 +13,6 @@ fi
# install dependencies if not installed # install dependencies if not installed
pip3 install cmake torch ninja pip3 install cmake torch ninja
# build gdrcopy, required by nvshmem
pushd $WORKSPACE
wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz
mkdir -p gdrcopy_src
tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1
pushd gdrcopy_src
make -j$(nproc)
make prefix=$WORKSPACE/gdrcopy_install install
popd
# build nvshmem # build nvshmem
pushd $WORKSPACE pushd $WORKSPACE
mkdir -p nvshmem_src mkdir -p nvshmem_src
...@@ -34,26 +24,30 @@ git init ...@@ -34,26 +24,30 @@ git init
git apply -vvv nvshmem.patch git apply -vvv nvshmem.patch
# assume CUDA_HOME is set correctly # assume CUDA_HOME is set correctly
export GDRCOPY_HOME=$WORKSPACE/gdrcopy_install if [ -z "$CUDA_HOME" ]; then
echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
exit 1
fi
# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1
export NVSHMEM_SHMEM_SUPPORT=0 export NVSHMEM_SHMEM_SUPPORT=0
export NVSHMEM_UCX_SUPPORT=0 export NVSHMEM_UCX_SUPPORT=0
export NVSHMEM_USE_NCCL=0 export NVSHMEM_USE_NCCL=0
export NVSHMEM_IBGDA_SUPPORT=1
export NVSHMEM_PMIX_SUPPORT=0 export NVSHMEM_PMIX_SUPPORT=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0 export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
export NVSHMEM_USE_GDRCOPY=1 export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_IBRC_SUPPORT=1 export NVSHMEM_IBRC_SUPPORT=0
# remove MPI dependency
export NVSHMEM_BUILD_TESTS=0 export NVSHMEM_BUILD_TESTS=0
export NVSHMEM_BUILD_EXAMPLES=0 export NVSHMEM_BUILD_EXAMPLES=0
export NVSHMEM_MPI_SUPPORT=0 export NVSHMEM_MPI_SUPPORT=0
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
export NVSHMEM_BUILD_TXZ_PACKAGE=0
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
cmake -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
cmake --build $WORKSPACE/nvshmem_build/ --target install
cd $WORKSPACE/nvshmem_build/
make -j$(nproc)
make install
popd popd
......
set -ex

# Build the GDRCopy Debian packages from the sources found under
# "$WORKSPACE/gdrcopy_src/packages" and install them with dpkg.
#
# Usage: bash install_system_libraries.sh [WORKSPACE]
#   WORKSPACE  staging directory; defaults to $(pwd)/ep_kernels_workspace.
#
# NOTE(review): this script does not fetch gdrcopy_src itself; it presumably
# relies on an earlier step having unpacked it into the workspace -- confirm.

# prepare workspace directory
WORKSPACE=$1
if [ -z "$WORKSPACE" ]; then
    export WORKSPACE="$(pwd)/ep_kernels_workspace"
fi

# Quote every expansion so a workspace path containing spaces or glob
# characters is passed through as a single argument.
if [ ! -d "$WORKSPACE" ]; then
    mkdir -p "$WORKSPACE"
fi

# build and install gdrcopy system packages
pushd "$WORKSPACE"
cd gdrcopy_src/packages
apt install devscripts -y
# Fall back to the conventional CUDA location when CUDA_HOME is unset/empty.
CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
# The "./" prefix keeps a package file name starting with "-" from being
# parsed as a dpkg option.
dpkg -i ./*.deb
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy
......
#!/usr/bin/env python3 #!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2018 The Chromium Authors. All rights reserved. # Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by a BSD-style license that can be
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob import glob
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs""" """vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
# The version.py should be independent library, and we always import the # The version.py should be independent library, and we always import the
# version library first. Such assumption is critical for some customization. # version library first. Such assumption is critical for some customization.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib import contextlib
import importlib import importlib
...@@ -281,6 +282,45 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, ...@@ -281,6 +282,45 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
def apply_repetition_penalties_torch(
        logits: torch.Tensor, prompt_mask: torch.Tensor,
        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
    """Apply repetition penalties to ``logits`` in place using PyTorch ops.

    For every position flagged in ``prompt_mask`` or ``output_mask`` the
    logit is divided by the row's penalty when it is positive and multiplied
    by it otherwise. Unflagged positions are scaled by 1.0 (left unchanged).

    Args:
        logits: 2-D tensor of logits, one row per sequence. Modified in place.
        prompt_mask: Boolean tensor, combined with ``output_mask`` via ``|``
            and broadcast against ``logits``.
        output_mask: Boolean tensor, see ``prompt_mask``.
        repetition_penalties: 1-D tensor holding one penalty per row of
            ``logits``.

    Returns:
        None. The result is written into ``logits``.
    """
    # Keep the penalties as a [num_seqs, 1] column and let broadcasting spread
    # them over the vocabulary dimension; this avoids materializing a full
    # [num_seqs, vocab_size] copy with ``repeat``.
    penalties_column = repetition_penalties.unsqueeze(dim=1)
    # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.
    seen_mask = prompt_mask | output_mask
    penalties = torch.where(seen_mask, penalties_column, 1.0)
    # If logits are positive, divide by penalty, otherwise multiply by penalty.
    scaling = torch.where(logits > 0, 1.0 / penalties, penalties)
    logits *= scaling
def apply_repetition_penalties_cuda(
        logits: torch.Tensor, prompt_mask: torch.Tensor,
        output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
    """Apply repetition penalties by calling the compiled ``_C`` custom op.

    All four tensors are forwarded unchanged to
    ``torch.ops._C.apply_repetition_penalties_``; nothing is returned.

    NOTE(review): the trailing underscore in the op name suggests it mutates
    ``logits`` in place -- confirm against the kernel implementation.
    """
    torch.ops._C.apply_repetition_penalties_(logits, prompt_mask, output_mask,
                                             repetition_penalties)
def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor,
                               output_mask: torch.Tensor,
                               repetition_penalties: torch.Tensor) -> None:
    """Penalize repeated tokens by rescaling ``logits`` in-place.

    The compiled CUDA kernel is used when running on a CUDA platform with
    contiguous ``logits``; every other case goes through the PyTorch
    implementation.

    Args:
        logits: The logits tensor of shape [num_seqs, vocab_size].
        prompt_mask: A boolean tensor indicating which tokens appear in the
            prompt.
        output_mask: A boolean tensor indicating which tokens appear in the
            output.
        repetition_penalties: The repetition penalties of shape (num_seqs, ).
    """
    # The contiguity check is only evaluated on CUDA platforms (short-circuit).
    use_cuda_kernel = current_platform.is_cuda() and logits.is_contiguous()
    if not use_cuda_kernel:
        apply_repetition_penalties_torch(logits, prompt_mask, output_mask,
                                         repetition_penalties)
        return
    apply_repetition_penalties_cuda(logits, prompt_mask, output_mask,
                                    repetition_penalties)
def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int, def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
input_tokens: torch.Tensor, input_tokens: torch.Tensor,
sampled_token_ids: torch.Tensor, sampled_token_ids: torch.Tensor,
...@@ -811,11 +851,16 @@ def cutlass_scaled_sparse_mm( ...@@ -811,11 +851,16 @@ def cutlass_scaled_sparse_mm(
return out return out
def get_cutlass_moe_mm_data( def get_cutlass_moe_mm_data(topk_ids: torch.Tensor,
topk_ids: torch.Tensor, expert_offsets: torch.Tensor, expert_offsets: torch.Tensor,
problem_sizes1: torch.Tensor, problem_sizes2: torch.Tensor, problem_sizes1: torch.Tensor,
input_permutation: torch.Tensor, output_permutation: torch.Tensor, problem_sizes2: torch.Tensor,
num_experts: int, n: int, k: int): input_permutation: torch.Tensor,
output_permutation: torch.Tensor,
num_experts: int,
n: int,
k: int,
blockscale_offsets: Optional[torch.Tensor] = None):
""" """
Prepare data necessary to perform CUTLASS grouped matrix multiplications Prepare data necessary to perform CUTLASS grouped matrix multiplications
used in CUTLASS-based fused MoE. used in CUTLASS-based fused MoE.
...@@ -833,19 +878,63 @@ def get_cutlass_moe_mm_data( ...@@ -833,19 +878,63 @@ def get_cutlass_moe_mm_data(
before executing the MMs. before executing the MMs.
- output_permutation: Permutation that must be used to shuffle the output - output_permutation: Permutation that must be used to shuffle the output
after executing the MMs. after executing the MMs.
- blockscale_offsets: Optional argument passed for fp4 moe. Indices that
mark at which block scale index each expert begins
its computation. The number of block scale rows
computed with expert E is blockscale_offsets[E + 1] -
blockscale_offsets[E]
""" """
return torch.ops._C.get_cutlass_moe_mm_data(topk_ids, expert_offsets, return torch.ops._C.get_cutlass_moe_mm_data(topk_ids, expert_offsets,
problem_sizes1, problem_sizes2, problem_sizes1, problem_sizes2,
input_permutation, input_permutation,
output_permutation, output_permutation,
num_experts, n, k) num_experts, n, k,
blockscale_offsets)
def shuffle_rows(input_tensor: torch.Tensor, dst2src_map: torch.Tensor):
    """Return a row-shuffled (and possibly expanded) copy of ``input_tensor``.

    A new tensor with ``dst2src_map.shape[0]`` rows and
    ``input_tensor.shape[1]`` columns is allocated on the input's device with
    the input's dtype, filled by the ``_moe_C.shuffle_rows`` custom op, and
    returned. This is used in MoE to permute the input before the grouped
    matrix multiplications.

    NOTE(review): presumably output row ``i`` is taken from input row
    ``dst2src_map[i]`` -- confirm against the kernel.
    """
    permuted_shape = (dst2src_map.shape[0], input_tensor.shape[1])
    shuffled = torch.empty(permuted_shape,
                           device=input_tensor.device,
                           dtype=input_tensor.dtype)
    # The custom op writes its result into the pre-allocated buffer.
    torch.ops._moe_C.shuffle_rows(input_tensor, dst2src_map, shuffled)
    return shuffled
def get_cutlass_pplx_moe_mm_data(expert_offsets: torch.Tensor,
                                 problem_sizes1: torch.Tensor,
                                 problem_sizes2: torch.Tensor,
                                 expert_num_tokens: torch.Tensor,
                                 num_local_experts: int, padded_m: int, n: int,
                                 k: int):
    """
    Prepare data necessary to perform CUTLASS grouped matrix multiplications
    used in CUTLASS-based fused MoE.

    The function takes in expert_num_tokens (token count per expert) and
    forwards it, together with the remaining arguments, to the
    ``_C.get_cutlass_pplx_moe_mm_data`` custom op, which computes:
    - expert_offsets: Indices that mark at which token index each expert begins
                      its computation.
    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
                                      multiplication in two grouped MMs used in
                                      the fused MoE operation.

    NOTE(review): expert_offsets, problem_sizes1 and problem_sizes2 are
    presumably output buffers filled in by the op -- confirm. An earlier
    version of this docstring also mentioned ``non_zero_expert_idxs``, which
    is not a parameter of this function.

    Returns whatever the underlying custom op returns.
    """
    return torch.ops._C.get_cutlass_pplx_moe_mm_data(
        expert_offsets, problem_sizes1, problem_sizes2, expert_num_tokens,
        num_local_experts, padded_m, n, k)
def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor, def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
b_tensors: torch.Tensor, a_scales: torch.Tensor, b_tensors: torch.Tensor, a_scales: torch.Tensor,
b_scales: torch.Tensor, expert_offsets: torch.Tensor, b_scales: torch.Tensor, expert_offsets: torch.Tensor,
problem_sizes: torch.Tensor, a_strides: torch.Tensor, problem_sizes: torch.Tensor, a_strides: torch.Tensor,
b_strides: torch.Tensor, c_strides: torch.Tensor): b_strides: torch.Tensor, c_strides: torch.Tensor,
per_act_token: bool, per_out_ch: bool):
""" """
A single grouped matrix multiplication used in CUTLASS-based fused MoE. A single grouped matrix multiplication used in CUTLASS-based fused MoE.
The function executes fp8-quantized OUT = AB matrix multiplication. The function executes fp8-quantized OUT = AB matrix multiplication.
...@@ -860,7 +949,7 @@ def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor, ...@@ -860,7 +949,7 @@ def cutlass_moe_mm(out_tensors: torch.Tensor, a_tensors: torch.Tensor,
return torch.ops._C.cutlass_moe_mm(out_tensors, a_tensors, b_tensors, return torch.ops._C.cutlass_moe_mm(out_tensors, a_tensors, b_tensors,
a_scales, b_scales, expert_offsets, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, problem_sizes, a_strides, b_strides,
c_strides) c_strides, per_act_token, per_out_ch)
def cutlass_fp4_moe_mm(a_tensors: torch.Tensor, b_tensors: torch.Tensor, def cutlass_fp4_moe_mm(a_tensors: torch.Tensor, b_tensors: torch.Tensor,
...@@ -1090,14 +1179,12 @@ def scaled_fp4_experts_quant( ...@@ -1090,14 +1179,12 @@ def scaled_fp4_experts_quant(
expert_offsets: torch.Tensor, expert_offsets: torch.Tensor,
blockscale_offsets: torch.Tensor, blockscale_offsets: torch.Tensor,
topk: int, topk: int,
expert_map: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
""" """
Quantize input tensor to FP4 and return quantized tensor and scale, for Quantize input tensor to FP4 and return quantized tensor and scale, for
packed MoE Inputs. packed MoE Inputs.
Args: Args:
input: The input tensor to be quantized to FP4 input_tensor: The input tensor to be quantized to FP4
expert_map: The expert map tensor
input_global_scale: A scalar scaling factor for the entire tensor. input_global_scale: A scalar scaling factor for the entire tensor.
expert_offsets: The expert offsets tensor expert_offsets: The expert offsets tensor
blockscale_offsets: The blockscale offsets tensor blockscale_offsets: The blockscale offsets tensor
...@@ -1109,14 +1196,13 @@ def scaled_fp4_experts_quant( ...@@ -1109,14 +1196,13 @@ def scaled_fp4_experts_quant(
assert input_tensor.ndim == 2, ( assert input_tensor.ndim == 2, (
f'input.ndim needs to be == 2, but got {input_tensor.ndim}.') f'input.ndim needs to be == 2, but got {input_tensor.ndim}.')
input_tensor = input_tensor[
expert_map] if expert_map is not None else input_tensor
m_numtopk, k = input_tensor.shape
# Control the maximum number of tokens per expert supported by the # Control the maximum number of tokens per expert supported by the
# NVFP4 MoE Expert Quantization. This is used to prevent the kernel # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
# from running out of memory. This value can also be increased to support # from running out of memory. This value can also be increased to support
# larger models. # larger models.
MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
m_numtopk, k = input_tensor.shape
assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), ( assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), (
f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT(" f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
f"{MAX_TOKENS_PER_EXPERT})" f"{MAX_TOKENS_PER_EXPERT})"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment