Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4851c202
Commit
4851c202
authored
Sep 13, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.1' into v0.6.1-dev
parents
9b902f9e
3fd2b0d2
Changes
203
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
22 additions
and
28 deletions
+22
-28
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_model_runner.py
+19
-26
vllm/worker/tpu_model_runner.py
vllm/worker/tpu_model_runner.py
+2
-2
vllm/worker/worker.py
vllm/worker/worker.py
+1
-0
No files found.
vllm/worker/multi_step_model_runner.py
View file @
4851c202
...
...
@@ -13,7 +13,6 @@ except ModuleNotFoundError:
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.distributed
import
get_pp_group
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.sampler
import
(
PromptLogprobs
,
SampleLogprobs
,
...
...
@@ -274,12 +273,13 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
self
.
pinned_sampled_token_ids
)
if
model_output
.
pythonized
:
ctx
=
output_proc_callback
.
keywords
[
"ctx"
]
is_async
=
False
is_last_step
=
False
ctx
.
output_queue
.
append
(
([
model_output
.
sampler_output
],
ctx
.
seq_group_metadata_list
,
ctx
.
scheduler_outputs
,
is_async
,
is_last_step
))
ctx
.
append_output
(
outputs
=
[
model_output
.
sampler_output
],
seq_group_metadata_list
=
ctx
.
seq_group_metadata_list
,
scheduler_outputs
=
ctx
.
scheduler_outputs
,
is_async
=
False
,
is_last_step
=
False
)
output_proc_callback
()
else
:
cont
=
False
...
...
@@ -319,12 +319,13 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
if
not
is_last_step
:
ctx
=
output_proc_callback
.
keywords
[
# type: ignore
"ctx"
]
# type: ignore
is_async
=
False
is_last_step
=
False
ctx
.
output_queue
.
append
(
([
output
.
sampler_output
],
ctx
.
seq_group_metadata_list
,
ctx
.
scheduler_outputs
,
is_async
,
is_last_step
))
ctx
.
append_output
(
outputs
=
[
output
.
sampler_output
],
seq_group_metadata_list
=
ctx
.
seq_group_metadata_list
,
scheduler_outputs
=
ctx
.
scheduler_outputs
,
is_async
=
False
,
is_last_step
=
False
)
else
:
outputs
.
append
(
output
.
sampler_output
)
else
:
...
...
@@ -497,19 +498,11 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
attn_metadata
=
frozen_model_input
.
attn_metadata
assert
isinstance
(
attn_metadata
,
FlashAttentionMetadata
)
attn_metadata
.
advance_step
(
num_seqs
,
num_queries
)
# Update GPU tensors
ops
.
advance_step
(
num_seqs
=
num_seqs
,
num_queries
=
num_queries
,
block_size
=
self
.
block_size
,
input_tokens
=
frozen_model_input
.
input_tokens
,
sampled_token_ids
=
model_input
.
cached_outputs
[
-
1
].
sampled_token_ids
,
input_positions
=
frozen_model_input
.
input_positions
,
seq_lens
=
attn_metadata
.
seq_lens_tensor
,
slot_mapping
=
attn_metadata
.
slot_mapping
,
block_tables
=
attn_metadata
.
block_tables
)
attn_metadata
.
advance_step
(
frozen_model_input
,
model_input
.
cached_outputs
[
-
1
].
sampled_token_ids
,
self
.
block_size
,
num_seqs
,
num_queries
)
if
frozen_model_input
.
seq_lens
is
not
None
:
for
i
in
range
(
num_queries
):
...
...
vllm/worker/tpu_model_runner.py
View file @
4851c202
...
...
@@ -11,7 +11,7 @@ import torch_xla.core.xla_model as xm
import
torch_xla.runtime
as
xr
from
vllm.attention
import
AttentionMetadata
,
get_attn_backend
from
vllm.compilation.wrapper
import
TorchCompileWrapperWithCustomDispa
c
ther
from
vllm.compilation.wrapper
import
TorchCompileWrapperWithCustomDispat
c
her
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
)
from
vllm.logger
import
init_logger
...
...
@@ -611,7 +611,7 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
return
[
SamplerOutput
(
sampler_outputs
)]
class
ModelWrapper
(
TorchCompileWrapperWithCustomDispa
c
ther
):
class
ModelWrapper
(
TorchCompileWrapperWithCustomDispat
c
her
):
def
__init__
(
self
,
model
:
nn
.
Module
):
self
.
model
=
model
...
...
vllm/worker/worker.py
View file @
4851c202
...
...
@@ -166,6 +166,7 @@ class Worker(LocalOrDistributedWorkerBase):
torch
.
cuda
.
set_device
(
self
.
device
)
_check_if_gpu_supports_dtype
(
self
.
model_config
.
dtype
)
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
self
.
init_gpu_memory
=
torch
.
cuda
.
mem_get_info
()[
0
]
else
:
...
...
Prev
1
…
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment