[V1] Fix torch profiling for offline inference (#11125)

Signed-off-by: Roger Wang <ywang@roblox.com>

[V1] Fix torch profiling for offline inference (#11125)
Signed-off-by: Roger Wang <ywang@roblox.com>
4816d20a · Roger Wang · GitHub · 85362f02 · 4816d20a · 4816d20a
Unverified commit 4816d20a, authored Dec 12, 2024 by Roger Wang; committed by GitHub on Dec 12, 2024.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 21 additions and 14 deletions

examples/offline_inference_with_profiler.py +19 -12

vllm/v1/engine/core_client.py +2 -2

No files found.
--- a/examples/offline_inference_with_profiler.py
+++ b/examples/offline_inference_with_profiler.py
 import os
+import time
 from vllm import LLM, SamplingParams
@@ -15,19 +16,25 @@ prompts = [
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-# Create an LLM.
+if __name__ == "__main__":
-llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
-llm.start_profile()
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+    llm.start_profile()
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-llm.stop_profile()
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
+    llm.stop_profile()
-for output in outputs:
-    prompt = output.prompt
+    # Print the outputs.
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    # Add a buffer to wait for profiler in the background process
+    # (in case MP is on) to finish writing profiling output.
+    time.sleep(10)
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -105,7 +105,7 @@ class InprocClient(EngineCoreClient):
    def __del__(self):
        self.shutdown()
-    async def profile(self, is_start=True) -> None:
+    def profile(self, is_start=True) -> None:
        self.engine_core.profile(is_start)
@@ -212,7 +212,7 @@ class SyncMPClient(MPClient):
    def abort_requests(self, request_ids: List[str]) -> None:
        self._send_input(EngineCoreRequestType.ABORT, request_ids)
-    async def profile(self, is_start=True) -> None:
+    def profile(self, is_start=True) -> None:
        self._send_input(EngineCoreRequestType.PROFILE,
                         EngineCoreProfile(is_start))