Commit ca4ec0ce authored by lizhigong's avatar lizhigong
Browse files

Merge remote-tracking branch 'origin/v0.7.2-dev' into v0.7.2_zero_overhead

parents 0be169ad ae0ed592
This diff is collapsed.
This diff is collapsed.
...@@ -5,6 +5,7 @@ import dataclasses ...@@ -5,6 +5,7 @@ import dataclasses
import json import json
import random import random
import time import time
from pathlib import Path
from functools import cache from functools import cache
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
...@@ -215,6 +216,28 @@ def run_vllm( ...@@ -215,6 +216,28 @@ def run_vllm(
use_beam_search = False use_beam_search = False
if not use_beam_search: if not use_beam_search:
if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = Path(
"."
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
print(f"Profiling (results will be saved to '{profile_dir}')...")
with torch.profiler.profile(
activities=[torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],record_shapes=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler(str(profile_dir))
) as prof:
start = time.perf_counter()
llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
end = time.perf_counter()
print('Prepare time report')
print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cuda_time_total", row_limit=-1))
else:
start = time.perf_counter() start = time.perf_counter()
llm.generate(prompts, llm.generate(prompts,
sampling_params, sampling_params,
...@@ -498,6 +521,16 @@ if __name__ == "__main__": ...@@ -498,6 +521,16 @@ if __name__ == "__main__":
type=int, type=int,
default=None, default=None,
help="Maximum batch size for HF backend.") help="Maximum batch size for HF backend.")
parser.add_argument(
'--profile',
action='store_true',
help='profile the generation process of a single batch')
parser.add_argument(
'--profile-result-dir',
type=str,
default=None,
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
parser.add_argument( parser.add_argument(
'--output-json', '--output-json',
type=str, type=str,
......
...@@ -15,6 +15,7 @@ if TYPE_CHECKING: ...@@ -15,6 +15,7 @@ if TYPE_CHECKING:
VLLM_NCCL_SO_PATH: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None
LD_LIBRARY_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None
VLLM_USE_TRITON_FLASH_ATTN: bool = False VLLM_USE_TRITON_FLASH_ATTN: bool = False
VLLM_USE_TRITON_OPT_MLA: bool = False
VLLM_USE_OPT_OP: bool = False VLLM_USE_OPT_OP: bool = False
VLLM_USE_TC_PAGED_ATTN: bool = False VLLM_USE_TC_PAGED_ATTN: bool = False
VLLM_USE_PA_PRINT_PARAM: bool = False VLLM_USE_PA_PRINT_PARAM: bool = False
...@@ -565,6 +566,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -565,6 +566,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_MLA_DISABLE": "VLLM_MLA_DISABLE":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# Flag that can control whether or not we perform matrix-absorption for MLA # Flag that can control whether or not we perform matrix-absorption for MLA
# decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
# matrices reduces the runtime FLOPs needed to compute MLA but requires # matrices reduces the runtime FLOPs needed to compute MLA but requires
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment