Unverified commit 774c5fde, authored by Divakar Verma and committed by GitHub
Browse files

[V1] fix torch profiling for V1 offline scenarios (#18445)


Signed-off-by: Divakar Verma <divakar.verma@amd.com>
parent 9a21e331
...@@ -6,13 +6,12 @@ import dataclasses ...@@ -6,13 +6,12 @@ import dataclasses
import json import json
import os import os
import time import time
from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import numpy as np import numpy as np
import torch
from tqdm import tqdm from tqdm import tqdm
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
...@@ -80,17 +79,9 @@ def main(args: argparse.Namespace): ...@@ -80,17 +79,9 @@ def main(args: argparse.Namespace):
def run_to_completion(profile_dir: Optional[str] = None): def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir: if profile_dir:
with torch.profiler.profile( llm.start_profile()
activities=[ llm_generate()
torch.profiler.ProfilerActivity.CPU, llm.stop_profile()
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)
),
) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
else: else:
start_time = time.perf_counter() start_time = time.perf_counter()
llm_generate() llm_generate()
...@@ -103,11 +94,7 @@ def main(args: argparse.Namespace): ...@@ -103,11 +94,7 @@ def main(args: argparse.Namespace):
run_to_completion(profile_dir=None) run_to_completion(profile_dir=None)
if args.profile: if args.profile:
profile_dir = args.profile_result_dir profile_dir = envs.VLLM_TORCH_PROFILER_DIR
if not profile_dir:
profile_dir = (
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
)
print(f"Profiling (results will be saved to '{profile_dir}')...") print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir) run_to_completion(profile_dir=profile_dir)
return return
...@@ -164,15 +151,6 @@ if __name__ == "__main__": ...@@ -164,15 +151,6 @@ if __name__ == "__main__":
action="store_true", action="store_true",
help="profile the generation process of a single batch", help="profile the generation process of a single batch",
) )
parser.add_argument(
"--profile-result-dir",
type=str,
default=None,
help=(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."
),
)
parser.add_argument( parser.add_argument(
"--output-json", "--output-json",
type=str, type=str,
...@@ -193,4 +171,9 @@ if __name__ == "__main__": ...@@ -193,4 +171,9 @@ if __name__ == "__main__":
# numbers. We need to disable prefix caching by default. # numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False) parser.set_defaults(enable_prefix_caching=False)
args = parser.parse_args() args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main(args) main(args)
...@@ -6,13 +6,12 @@ import dataclasses ...@@ -6,13 +6,12 @@ import dataclasses
import json import json
import os import os
import time import time
from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import numpy as np import numpy as np
import torch
from tqdm import tqdm from tqdm import tqdm
import vllm.envs as envs
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
write_to_json) write_to_json)
...@@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
action="store_true", action="store_true",
help="profile the generation process of a single batch", help="profile the generation process of a single batch",
) )
parser.add_argument(
"--profile-result-dir",
type=str,
default=None,
help=("path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."),
)
parser.add_argument( parser.add_argument(
"--output-json", "--output-json",
type=str, type=str,
...@@ -87,7 +79,10 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -87,7 +79,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler.")
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
# NOTE(woosuk): If the request cannot be processed in a single batch, # NOTE(woosuk): If the request cannot be processed in a single batch,
...@@ -131,16 +126,9 @@ def main(args: argparse.Namespace): ...@@ -131,16 +126,9 @@ def main(args: argparse.Namespace):
def run_to_completion(profile_dir: Optional[str] = None): def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir: if profile_dir:
with torch.profiler.profile( llm.start_profile()
activities=[ llm_generate()
torch.profiler.ProfilerActivity.CPU, llm.stop_profile()
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)),
) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
else: else:
start_time = time.perf_counter() start_time = time.perf_counter()
llm_generate() llm_generate()
...@@ -153,10 +141,7 @@ def main(args: argparse.Namespace): ...@@ -153,10 +141,7 @@ def main(args: argparse.Namespace):
run_to_completion(profile_dir=None) run_to_completion(profile_dir=None)
if args.profile: if args.profile:
profile_dir = args.profile_result_dir profile_dir = envs.VLLM_TORCH_PROFILER_DIR
if not profile_dir:
profile_dir = (Path(".") / "vllm_benchmark_result" /
f"latency_result_{time.time()}")
print(f"Profiling (results will be saved to '{profile_dir}')...") print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir) run_to_completion(profile_dir=profile_dir)
return return
......
...@@ -292,6 +292,8 @@ class Worker(WorkerBase): ...@@ -292,6 +292,8 @@ class Worker(WorkerBase):
self.profiler.start() self.profiler.start()
else: else:
self.profiler.stop() self.profiler.stop()
print(self.profiler.key_averages().table(
sort_by="self_cuda_time_total"))
def execute_dummy_batch(self) -> None: def execute_dummy_batch(self) -> None:
self.model_runner._dummy_run(1) self.model_runner._dummy_run(1)
......
...@@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase): ...@@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase):
if self.profiler is None: if self.profiler is None:
raise RuntimeError("Profiler is not enabled.") raise RuntimeError("Profiler is not enabled.")
self.profiler.stop() self.profiler.stop()
print(
self.profiler.key_averages().table(sort_by="self_cuda_time_total"))
def sleep(self, level: int = 1) -> None: def sleep(self, level: int = 1) -> None:
free_bytes_before_sleep = torch.cuda.mem_get_info()[0] free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment