"vllm/executor/mp_distributed_executor.py" did not exist on "c5832d2ae9431a1672d547c232ec46b1a9051ff0"
Unverified Commit 975676d1 authored by Benjamin Chislett's avatar Benjamin Chislett Committed by GitHub
Browse files

[Feat] Drop-in Torch CUDA Profiler (#27841)


Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
parent 77d702a2
...@@ -39,7 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline ...@@ -39,7 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline
```bash ```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
vllm serve meta-llama/Meta-Llama-3-70B vllm serve meta-llama/Llama-3.1-8B-Instruct
``` ```
vllm bench command: vllm bench command:
...@@ -47,7 +47,7 @@ vllm bench command: ...@@ -47,7 +47,7 @@ vllm bench command:
```bash ```bash
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Meta-Llama-3-70B \ --model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name sharegpt \ --dataset-name sharegpt \
--dataset-path sharegpt.json \ --dataset-path sharegpt.json \
--profile \ --profile \
...@@ -70,18 +70,21 @@ apt update ...@@ -70,18 +70,21 @@ apt update
apt install nsight-systems-cli apt install nsight-systems-cli
``` ```
### Example commands and usage !!! tip
When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues). The Nsight Systems profiler can be launched with `nsys profile ...`, with a few recommended flags for vLLM: `--trace-fork-before-exec=true --cuda-graph-trace=node`.
### Example commands and usage
#### Offline Inference #### Offline Inference
For basic usage, you can just prepend `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` to any existing script you would run for offline inference. For basic usage, you can just prepend the profiling command to any existing script you would run for offline inference.
The following is an example using the `vllm bench latency` script: The following is an example using the `vllm bench latency` script:
```bash ```bash
nsys profile -o report.nsys-rep \ nsys profile \
--trace-fork-before-exec=true \ --trace-fork-before-exec=true \
--cuda-graph-trace=node \ --cuda-graph-trace=node \
vllm bench latency \ vllm bench latency \
...@@ -95,40 +98,29 @@ vllm bench latency \ ...@@ -95,40 +98,29 @@ vllm bench latency \
#### OpenAI Server #### OpenAI Server
To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, however you must specify `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration time has been used up, the server will be killed. To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, but you will need to specify a few other arguments to enable dynamic capture similarly to the Torch Profiler:
```bash ```bash
# server # server
nsys profile -o report.nsys-rep \ VLLM_TORCH_CUDA_PROFILE=1 \
nsys profile \
--trace-fork-before-exec=true \ --trace-fork-before-exec=true \
--cuda-graph-trace=node \ --cuda-graph-trace=node \
--delay 30 \ --capture-range=cudaProfilerApi \
--duration 60 \ --capture-range-end repeat \
vllm serve meta-llama/Llama-3.1-8B-Instruct vllm serve meta-llama/Llama-3.1-8B-Instruct
# client # client
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \ --dataset-name sharegpt \
--dataset-name random \ --dataset-path sharegpt.json \
--random-input 1024 \ --profile \
--random-output 512 --num-prompts 2
```
In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
```bash
nsys sessions list
```
to get the session id in the form of `profile-XXXXX`, then run:
```bash
nsys stop --session=profile-XXXXX
``` ```
to manually kill the profiler and generate your `nsys-rep` report. With `--profile`, vLLM will capture a profile for each run of `vllm bench serve`. Once the server is killed, the profiles will all be saved.
#### Analysis #### Analysis
......
...@@ -1280,10 +1280,16 @@ async def invocations(raw_request: Request): ...@@ -1280,10 +1280,16 @@ async def invocations(raw_request: Request):
if envs.VLLM_TORCH_PROFILER_DIR: if envs.VLLM_TORCH_PROFILER_DIR:
logger.warning( logger.warning_once(
"Torch Profiler is enabled in the API server. This should ONLY be " "Torch Profiler is enabled in the API server. This should ONLY be "
"used for local development!" "used for local development!"
) )
elif envs.VLLM_TORCH_CUDA_PROFILE:
logger.warning_once(
"CUDA Profiler is enabled in the API server. This should ONLY be "
"used for local development!"
)
if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE:
@router.post("/start_profile") @router.post("/start_profile")
async def start_profile(raw_request: Request): async def start_profile(raw_request: Request):
......
...@@ -87,6 +87,7 @@ if TYPE_CHECKING: ...@@ -87,6 +87,7 @@ if TYPE_CHECKING:
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds
VLLM_PLUGINS: list[str] | None = None VLLM_PLUGINS: list[str] | None = None
VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
VLLM_TORCH_CUDA_PROFILE: bool = False
VLLM_TORCH_PROFILER_DIR: str | None = None VLLM_TORCH_PROFILER_DIR: str | None = None
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
...@@ -815,6 +816,11 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -815,6 +816,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv( "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
"VLLM_LORA_RESOLVER_CACHE_DIR", None "VLLM_LORA_RESOLVER_CACHE_DIR", None
), ),
# Enables torch CUDA profiling if set.
# On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered.
"VLLM_TORCH_CUDA_PROFILE": lambda: bool(
os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0"
),
# Enables torch profiler if set. # Enables torch profiler if set.
# Both AsyncLLM's CPU traces as well as workers' # Both AsyncLLM's CPU traces as well as workers'
# traces (CPU & GPU) will be saved under this directory. # traces (CPU & GPU) will be saved under this directory.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logger import init_logger
logger = init_logger(__name__)
class CudaProfilerWrapper:
    """Start/stop wrapper around ``torch.cuda.profiler`` that tracks state.

    Exceptions raised while starting or stopping the profiler are logged
    as warnings instead of being propagated to the caller.
    """

    def __init__(self) -> None:
        # Deferred import to avoid dependency issues if CUDA is not
        # available.
        import torch.cuda.profiler as cuda_profiler

        self._cuda_profiler = cuda_profiler
        # Set to True by a successful start(); reset by stop().
        self._profiler_running: bool = False

    def start(self) -> None:
        """Start the CUDA profiler, logging a warning if that fails."""
        try:
            self._cuda_profiler.start()
            self._profiler_running = True
            logger.info_once("Started CUDA profiler")
        except Exception as exc:
            logger.warning_once("Failed to start CUDA profiler: %s", exc)

    def stop(self) -> None:
        """Stop the CUDA profiler if this wrapper recorded it as running."""
        if not self._profiler_running:
            # Nothing was started through this wrapper; nothing to stop.
            return

        try:
            self._cuda_profiler.stop()
            logger.info_once("Stopped CUDA profiler")
        except Exception as exc:
            logger.warning_once("Failed to stop CUDA profiler: %s", exc)
        finally:
            # The running flag is cleared whether or not stop() succeeded.
            self._profiler_running = False

    def shutdown(self) -> None:
        """Ensure profiler is stopped when shutting down."""
        self.stop()
...@@ -35,6 +35,7 @@ from vllm.model_executor import set_random_seed ...@@ -35,6 +35,7 @@ from vllm.model_executor import set_random_seed
from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.models.interfaces import is_mixture_of_experts
from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.model_executor.warmup.kernel_warmup import kernel_warmup
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.profiler.gpu_profiler import CudaProfilerWrapper
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
...@@ -116,6 +117,8 @@ class Worker(WorkerBase): ...@@ -116,6 +117,8 @@ class Worker(WorkerBase):
torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
), ),
) )
elif envs.VLLM_TORCH_CUDA_PROFILE:
self.profiler = CudaProfilerWrapper()
else: else:
self.profiler = None self.profiler = None
...@@ -593,7 +596,10 @@ class Worker(WorkerBase): ...@@ -593,7 +596,10 @@ class Worker(WorkerBase):
else: else:
self.profiler.stop() self.profiler.stop()
# only print profiler results on rank 0 # only print profiler results on rank 0
if self.local_rank == 0: if (
isinstance(self.profiler, torch.profiler.profile)
and self.local_rank == 0
):
print( print(
self.profiler.key_averages().table(sort_by="self_cuda_time_total") self.profiler.key_averages().table(sort_by="self_cuda_time_total")
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment