Merge remote-tracking branch 'origin/v0.7.2-dev' into v0.7.2_zero_overhead

ca4ec0ce · lizhigong · 0be169ad · ae0ed592 · ca4ec0ce · ca4ec0ce
Commit ca4ec0ce authored Mar 25, 2025 by lizhigong
20 changed files
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
--- a/vllm/benchmarks/backend_request_func.py
+++ b/vllm/benchmarks/backend_request_func.py
--- a/vllm/benchmarks/benchmark_serving.py
+++ b/vllm/benchmarks/benchmark_serving.py
--- a/vllm/benchmarks/benchmark_throughput.py
+++ b/vllm/benchmarks/benchmark_throughput.py
@@ -5,6 +5,7 @@ import dataclasses
 import json
 import random
 import time
+from pathlib import Path
 from functools import cache
 from typing import Dict, List, Optional, Tuple

@@ -215,12 +216,34 @@ def run_vllm(
    use_beam_search = False

    if not use_beam_search:
-        start = time.perf_counter()
-        llm.generate(prompts,
-                     sampling_params,
-                     lora_request=lora_requests,
-                     use_tqdm=True)
-        end = time.perf_counter()
+        if args.profile:
+            profile_dir = args.profile_result_dir
+            if not profile_dir:
+                profile_dir = Path(
+                    "."
+                ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            print(f"Profiling (results will be saved to '{profile_dir}')...")
+            with torch.profiler.profile(
+                        activities=[torch.profiler.ProfilerActivity.CPU,
+                                    torch.profiler.ProfilerActivity.CUDA,
+                        ],record_shapes=True,
+                        on_trace_ready=torch.profiler.tensorboard_trace_handler(str(profile_dir))
+                        ) as prof:
+                start = time.perf_counter()
+                llm.generate(prompts,
+                        sampling_params,
+                        lora_request=lora_requests,
+                        use_tqdm=True)
+                end = time.perf_counter()
+            print('Prepare time report')
+            print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cuda_time_total", row_limit=-1))
+        else:
+            start = time.perf_counter()
+            llm.generate(prompts,
+                        sampling_params,
+                        lora_request=lora_requests,
+                        use_tqdm=True)
+            end = time.perf_counter()  
    else:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
        prompts = [request.prompt for request in requests]
@@ -498,6 +521,16 @@ if __name__ == "__main__":
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
    parser.add_argument(
        '--output-json',
        type=str,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -15,6 +15,7 @@ if TYPE_CHECKING:
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_USE_TRITON_OPT_MLA: bool = False
    VLLM_USE_OPT_OP: bool = False
    VLLM_USE_TC_PAGED_ATTN: bool = False
    VLLM_USE_PA_PRINT_PARAM: bool = False 
@@ -564,6 +565,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # If set, vLLM will disable the MLA attention optimizations.
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
+    
+    # If set, vLLM will use the optimized MLA attention implementation.
+    "VLLM_USE_TRITON_OPT_MLA":
+    lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),

    # Flag that can control whether or not we perform matrix-absorption for MLA
    # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the

--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=DCU_K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16_120.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16_120.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=DCU_K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI,dtype=int4_w4a16.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI,dtype=int4_w4a16.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI,dtype=int4_w4a16_120.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI,dtype=int4_w4a16_120.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=DCU_K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=64,N=256,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=256,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=64,N=256,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=256,device_name=DCU_K100_AI_nn.json