Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4816d20a
Unverified
Commit
4816d20a
authored
Dec 12, 2024
by
Roger Wang
Committed by
GitHub
Dec 12, 2024
Browse files
[V1] Fix torch profiling for offline inference (#11125)
Signed-off-by:
Roger Wang
<
ywang@roblox.com
>
parent
85362f02
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
21 additions
and
14 deletions
+21
-14
examples/offline_inference_with_profiler.py
examples/offline_inference_with_profiler.py
+19
-12
vllm/v1/engine/core_client.py
vllm/v1/engine/core_client.py
+2
-2
No files found.
examples/offline_inference_with_profiler.py
View file @
4816d20a
import
os
import
os
import
time
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
...
@@ -15,19 +16,25 @@ prompts = [
...
@@ -15,19 +16,25 @@ prompts = [
# Create a sampling params object.
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM.
if
__name__
==
"__main__"
:
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
tensor_parallel_size
=
1
)
llm
.
start_profile
()
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
tensor_parallel_size
=
1
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
llm
.
start_profile
()
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
llm
.
stop_profile
()
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
llm
.
stop_profile
()
for
output
in
outputs
:
prompt
=
output
.
prompt
# Print the outputs.
generated_text
=
output
.
outputs
[
0
].
text
for
output
in
outputs
:
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
time
.
sleep
(
10
)
vllm/v1/engine/core_client.py
View file @
4816d20a
...
@@ -105,7 +105,7 @@ class InprocClient(EngineCoreClient):
...
@@ -105,7 +105,7 @@ class InprocClient(EngineCoreClient):
def
__del__
(
self
):
def
__del__
(
self
):
self
.
shutdown
()
self
.
shutdown
()
async
def
profile
(
self
,
is_start
=
True
)
->
None
:
def
profile
(
self
,
is_start
=
True
)
->
None
:
self
.
engine_core
.
profile
(
is_start
)
self
.
engine_core
.
profile
(
is_start
)
...
@@ -212,7 +212,7 @@ class SyncMPClient(MPClient):
...
@@ -212,7 +212,7 @@ class SyncMPClient(MPClient):
def
abort_requests
(
self
,
request_ids
:
List
[
str
])
->
None
:
def
abort_requests
(
self
,
request_ids
:
List
[
str
])
->
None
:
self
.
_send_input
(
EngineCoreRequestType
.
ABORT
,
request_ids
)
self
.
_send_input
(
EngineCoreRequestType
.
ABORT
,
request_ids
)
async
def
profile
(
self
,
is_start
=
True
)
->
None
:
def
profile
(
self
,
is_start
=
True
)
->
None
:
self
.
_send_input
(
EngineCoreRequestType
.
PROFILE
,
self
.
_send_input
(
EngineCoreRequestType
.
PROFILE
,
EngineCoreProfile
(
is_start
))
EngineCoreProfile
(
is_start
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment