Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7d224eb2
Commit
7d224eb2
authored
May 08, 2025
by
lizhigong
Browse files
rm debug log
parent
0ecda6d1
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
7 additions
and
19 deletions
+7
-19
vllm/executor/mp_distributed_executor.py
vllm/executor/mp_distributed_executor.py
+7
-7
vllm/model_executor/layers/sampler.py
vllm/model_executor/layers/sampler.py
+0
-1
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+0
-2
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+0
-5
vllm/zero_overhead/model_runner.py
vllm/zero_overhead/model_runner.py
+0
-2
vllm/zero_overhead/sampler.py
vllm/zero_overhead/sampler.py
+0
-1
vllm/zero_overhead/spec_decode/spec_decode_worker.py
vllm/zero_overhead/spec_decode/spec_decode_worker.py
+0
-1
No files found.
vllm/executor/mp_distributed_executor.py
View file @
7d224eb2
...
@@ -48,13 +48,13 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
...
@@ -48,13 +48,13 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
f
"is less than than max local gpu count (
{
cuda_device_count
}
)"
)
f
"is less than than max local gpu count (
{
cuda_device_count
}
)"
)
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
if
"CUDA_VISIBLE_DEVICES"
or
"HIP_VISIBLE_DEVICES"
not
in
os
.
environ
:
#
if "CUDA_VISIBLE_DEVICES" or "HIP_VISIBLE_DEVICES" not in os.environ:
update_environment_variables
({
#
update_environment_variables({
"CUDA_VISIBLE_DEVICES"
:
(
","
.
join
(
map
(
str
,
range
(
world_size
))))
#
"CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
})
#
})
update_environment_variables
({
#
update_environment_variables({
"HIP_VISIBLE_DEVICES"
:
(
","
.
join
(
map
(
str
,
range
(
world_size
))))
#
"HIP_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
})
#
})
def
_init_executor
(
self
)
->
None
:
def
_init_executor
(
self
)
->
None
:
...
...
vllm/model_executor/layers/sampler.py
View file @
7d224eb2
...
@@ -746,7 +746,6 @@ def _sample_with_torch(
...
@@ -746,7 +746,6 @@ def _sample_with_torch(
else
:
else
:
raise
ValueError
(
f
"Unsupported sampling type:
{
sampling_type
}
"
)
raise
ValueError
(
f
"Unsupported sampling type:
{
sampling_type
}
"
)
print
(
'###sampled_token_ids'
,
sampled_token_ids_
)
# Encapsulate arguments for computing Pythonized sampler
# Encapsulate arguments for computing Pythonized sampler
# results, whether deferred or otherwise.
# results, whether deferred or otherwise.
maybe_deferred_args
=
SampleResultArgsType
(
maybe_deferred_args
=
SampleResultArgsType
(
...
...
vllm/spec_decode/spec_decode_worker.py
View file @
7d224eb2
...
@@ -910,7 +910,6 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
...
@@ -910,7 +910,6 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
accepted_token_ids
,
target_logprobs
,
select_indices_list
,
accept_lengths
=
self
.
_verify_tokens
(
accepted_token_ids
,
target_logprobs
,
select_indices_list
,
accept_lengths
=
self
.
_verify_tokens
(
execute_model_req
.
seq_group_metadata_list
,
proposal_scores
,
execute_model_req
.
seq_group_metadata_list
,
proposal_scores
,
proposals
,
execute_model_req
.
num_lookahead_slots
)
proposals
,
execute_model_req
.
num_lookahead_slots
)
print
(
'###accepted_token_ids'
,
accepted_token_ids
)
# move kv_caches of selected tokens to right positions
# move kv_caches of selected tokens to right positions
if
self
.
tree_decoding
:
if
self
.
tree_decoding
:
...
@@ -1341,7 +1340,6 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
...
@@ -1341,7 +1340,6 @@ class SpecDecodeWorker(LoRANotSupportedWorkerBase):
self
.
_maybe_log_stage_times
(
*
stage_times
)
self
.
_maybe_log_stage_times
(
*
stage_times
)
# First `n_prefills` entries will contain prefills SamplerOutput when
# First `n_prefills` entries will contain prefills SamplerOutput when
# chunked prefill is enabled, the rest is decodes in multi-step format.
# chunked prefill is enabled, the rest is decodes in multi-step format.
print
(
'###sampler_output_list'
,
sampler_output_list
)
return
sampler_output_list
return
sampler_output_list
def
_maybe_log_stage_times
(
self
,
average_time_per_proposal_tok_ms
:
float
,
def
_maybe_log_stage_times
(
self
,
average_time_per_proposal_tok_ms
:
float
,
...
...
vllm/worker/model_runner.py
View file @
7d224eb2
...
@@ -902,7 +902,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -902,7 +902,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Tokens and positions.
# Tokens and positions.
if
cuda_graph_pad_size
:
if
cuda_graph_pad_size
:
input_tokens
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_tokens
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
print
(
'###input_tokens'
,
input_tokens
)
assert
self
.
runner
.
device
is
not
None
assert
self
.
runner
.
device
is
not
None
input_tokens_tensor
=
async_tensor_h2d
(
input_tokens
,
torch
.
long
,
input_tokens_tensor
=
async_tensor_h2d
(
input_tokens
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
device
,
...
@@ -917,14 +916,12 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -917,14 +916,12 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
for
idx
in
range
(
3
):
for
idx
in
range
(
3
):
mrope_input_positions
[
idx
].
extend
(
mrope_input_positions
[
idx
].
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
print
(
'###mrope_input_positions'
,
mrope_input_positions
)
input_positions_tensor
=
async_tensor_h2d
(
mrope_input_positions
,
input_positions_tensor
=
async_tensor_h2d
(
mrope_input_positions
,
torch
.
long
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
device
,
self
.
runner
.
pin_memory
)
self
.
runner
.
pin_memory
)
else
:
else
:
input_positions
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
input_positions
.
extend
(
itertools
.
repeat
(
0
,
cuda_graph_pad_size
))
print
(
'###input_positions'
,
input_positions
)
input_positions_tensor
=
async_tensor_h2d
(
input_positions
,
input_positions_tensor
=
async_tensor_h2d
(
input_positions
,
torch
.
long
,
torch
.
long
,
self
.
runner
.
device
,
self
.
runner
.
device
,
...
@@ -932,7 +929,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -932,7 +929,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
# Sequence and query lengths.
# Sequence and query lengths.
if
cuda_graph_pad_size
:
if
cuda_graph_pad_size
:
seq_lens
.
extend
(
itertools
.
repeat
(
1
,
cuda_graph_pad_size
))
seq_lens
.
extend
(
itertools
.
repeat
(
1
,
cuda_graph_pad_size
))
print
(
'###seq_lens'
,
seq_lens
)
# Attention metadata.
# Attention metadata.
attn_metadata
=
self
.
attn_metadata_builder
.
build
(
attn_metadata
=
self
.
attn_metadata_builder
.
build
(
...
@@ -1006,7 +1002,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -1006,7 +1002,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
prompt_adapter_mapping
=
prompt_adapter_mapping
,
prompt_adapter_mapping
=
prompt_adapter_mapping
,
prompt_adapter_requests
=
prompt_adapter_requests
)
prompt_adapter_requests
=
prompt_adapter_requests
)
print
(
'###model_input'
,
ret
)
return
ret
return
ret
...
...
vllm/zero_overhead/model_runner.py
View file @
7d224eb2
...
@@ -92,7 +92,6 @@ class ZeroOverheadModelInputForGpuBuilder(ModelInputForGPUBuilder):
...
@@ -92,7 +92,6 @@ class ZeroOverheadModelInputForGpuBuilder(ModelInputForGPUBuilder):
def
build
(
self
)
->
ModelInputForGPU
:
def
build
(
self
)
->
ModelInputForGPU
:
model_input
=
super
().
build
()
model_input
=
super
().
build
()
print
(
'###model_input'
,
model_input
)
last_sampler
=
get_last_sampler
()
last_sampler
=
get_last_sampler
()
spec_step
=
get_spec_step
()
spec_step
=
get_spec_step
()
last_step
=
get_spec_last_step
()
last_step
=
get_spec_last_step
()
...
@@ -167,5 +166,4 @@ class ZeroOverheadModelInputForGpuBuilder(ModelInputForGPUBuilder):
...
@@ -167,5 +166,4 @@ class ZeroOverheadModelInputForGpuBuilder(ModelInputForGPUBuilder):
)
)
print
(
'###zero_model_input'
,
model_input
)
return
model_input
return
model_input
vllm/zero_overhead/sampler.py
View file @
7d224eb2
...
@@ -359,7 +359,6 @@ def _sample_with_torch(
...
@@ -359,7 +359,6 @@ def _sample_with_torch(
sampled_token_ids_tensor
[
long_sample_indices
]
=
\
sampled_token_ids_tensor
[
long_sample_indices
]
=
\
multinomial_samples
[
sampling_type
].
to
(
torch
.
long
)
multinomial_samples
[
sampling_type
].
to
(
torch
.
long
)
print
(
'###sampled_token_ids'
,
last_sampler
.
sampled_token_ids_tensor
)
# Encapsulate arguments for computing Pythonized sampler
# Encapsulate arguments for computing Pythonized sampler
# results, whether deferred or otherwise.
# results, whether deferred or otherwise.
maybe_deferred_args
=
SampleResultArgsType
(
maybe_deferred_args
=
SampleResultArgsType
(
...
...
vllm/zero_overhead/spec_decode/spec_decode_worker.py
View file @
7d224eb2
...
@@ -545,7 +545,6 @@ class ZeroOverheadSpecDecodeWorker(SpecDecodeWorker):
...
@@ -545,7 +545,6 @@ class ZeroOverheadSpecDecodeWorker(SpecDecodeWorker):
self
.
_maybe_log_stage_times
(
*
stage_times
)
self
.
_maybe_log_stage_times
(
*
stage_times
)
# First `n_prefills` entries will contain prefills SamplerOutput when
# First `n_prefills` entries will contain prefills SamplerOutput when
# chunked prefill is enabled, the rest is decodes in multi-step format.
# chunked prefill is enabled, the rest is decodes in multi-step format.
print
(
'###sampler_output_list'
,
sampler_output_list
)
return
sampler_output_list
return
sampler_output_list
def
_track_sequences_with_bonus_tokens
(
def
_track_sequences_with_bonus_tokens
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment