Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
28598f39
Unverified
Commit
28598f39
authored Nov 22, 2024 by Russell Bryant
Committed by GitHub Nov 22, 2024
Browse files
[Core] remove temporary local variables in LLMEngine.__init__ (#10577)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
parent
948c8595
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing 1 changed file with 66 additions and 77 deletions
+66
-77
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+66
-77
No files found.
vllm/engine/llm_engine.py
View file @
28598f39
...
@@ -231,19 +231,18 @@ class LLMEngine:
...
@@ -231,19 +231,18 @@ class LLMEngine:
use_cached_outputs
:
bool
=
False
,
use_cached_outputs
:
bool
=
False
,
)
->
None
:
)
->
None
:
# TODO: remove the local variables and use self.* throughout the class.
self
.
model_config
=
vllm_config
.
model_config
model_config
=
self
.
model_config
=
vllm_config
.
model_config
self
.
cache_config
=
vllm_config
.
cache_config
cache_config
=
self
.
cache_config
=
vllm_config
.
cache_config
self
.
lora_config
=
vllm_config
.
lora_config
lora_config
=
self
.
lora_config
=
vllm_config
.
lora_config
self
.
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
self
.
parallel_config
=
vllm_config
.
parallel_config
self
.
scheduler_config
=
vllm_config
.
scheduler_config
scheduler_config
=
self
.
scheduler_config
=
vllm_config
.
scheduler_config
self
.
device_config
=
vllm_config
.
device_config
device_config
=
self
.
device_config
=
vllm_config
.
device_config
self
.
speculative_config
=
vllm_config
.
speculative_config
# noqa
speculative_config
=
self
.
speculative_config
=
vllm_config
.
speculative_config
# noqa
self
.
load_config
=
vllm_config
.
load_config
load_config
=
self
.
load_config
=
vllm_config
.
load_config
self
.
decoding_config
=
vllm_config
.
decoding_config
or
DecodingConfig
(
# noqa
decoding_config
=
self
.
decoding_config
=
vllm_config
.
decoding_config
or
DecodingConfig
(
# noqa
)
)
prompt_adapter_config
=
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
# noqa
self
.
prompt_adapter_config
=
vllm_config
.
prompt_adapter_config
# noqa
observability_config
=
self
.
observability_config
=
vllm_config
.
observability_config
or
ObservabilityConfig
(
# noqa
self
.
observability_config
=
vllm_config
.
observability_config
or
ObservabilityConfig
(
# noqa
)
)
logger
.
info
(
logger
.
info
(
...
@@ -265,54 +264,43 @@ class LLMEngine:
...
@@ -265,54 +264,43 @@ class LLMEngine:
"mm_processor_kwargs=%s, pooler_config=%r,"
"mm_processor_kwargs=%s, pooler_config=%r,"
"compilation_config=%r"
,
"compilation_config=%r"
,
VLLM_VERSION
,
VLLM_VERSION
,
model_config
.
model
,
self
.
model_config
.
model
,
speculative_config
,
self
.
speculative_config
,
model_config
.
tokenizer
,
self
.
model_config
.
tokenizer
,
model_config
.
skip_tokenizer_init
,
self
.
model_config
.
skip_tokenizer_init
,
model_config
.
tokenizer_mode
,
self
.
model_config
.
tokenizer_mode
,
model_config
.
revision
,
self
.
model_config
.
revision
,
model_config
.
override_neuron_config
,
self
.
model_config
.
override_neuron_config
,
model_config
.
tokenizer_revision
,
self
.
model_config
.
tokenizer_revision
,
model_config
.
trust_remote_code
,
self
.
model_config
.
trust_remote_code
,
model_config
.
dtype
,
self
.
model_config
.
dtype
,
model_config
.
max_model_len
,
self
.
model_config
.
max_model_len
,
load_config
.
download_dir
,
self
.
load_config
.
download_dir
,
load_config
.
load_format
,
self
.
load_config
.
load_format
,
parallel_config
.
tensor_parallel_size
,
self
.
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
,
self
.
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
disable_custom_all_reduce
,
self
.
parallel_config
.
disable_custom_all_reduce
,
model_config
.
quantization
,
self
.
model_config
.
quantization
,
model_config
.
enforce_eager
,
self
.
model_config
.
enforce_eager
,
cache_config
.
cache_dtype
,
self
.
cache_config
.
cache_dtype
,
model_config
.
quantization_param_path
,
self
.
model_config
.
quantization_param_path
,
device_config
.
device
,
self
.
device_config
.
device
,
decoding_config
,
self
.
decoding_config
,
observability_config
,
self
.
observability_config
,
model_config
.
seed
,
self
.
model_config
.
seed
,
model_config
.
served_model_name
,
self
.
model_config
.
served_model_name
,
scheduler_config
.
num_scheduler_steps
,
self
.
scheduler_config
.
num_scheduler_steps
,
scheduler_config
.
chunked_prefill_enabled
,
self
.
scheduler_config
.
chunked_prefill_enabled
,
scheduler_config
.
multi_step_stream_outputs
,
self
.
scheduler_config
.
multi_step_stream_outputs
,
cache_config
.
enable_prefix_caching
,
self
.
cache_config
.
enable_prefix_caching
,
model_config
.
use_async_output_proc
,
self
.
model_config
.
use_async_output_proc
,
use_cached_outputs
,
use_cached_outputs
,
model_config
.
mm_processor_kwargs
,
self
.
model_config
.
mm_processor_kwargs
,
model_config
.
pooler_config
,
self
.
model_config
.
pooler_config
,
vllm_config
.
compilation_config
,
vllm_config
.
compilation_config
,
)
)
# TODO(woosuk): Print more configs in debug mode.
# TODO(woosuk): Print more configs in debug mode.
self
.
model_config
=
model_config
self
.
cache_config
=
cache_config
self
.
lora_config
=
lora_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
speculative_config
=
speculative_config
self
.
load_config
=
load_config
self
.
decoding_config
=
decoding_config
or
DecodingConfig
()
self
.
prompt_adapter_config
=
prompt_adapter_config
self
.
observability_config
=
observability_config
or
ObservabilityConfig
(
)
self
.
log_stats
=
log_stats
self
.
log_stats
=
log_stats
self
.
use_cached_outputs
=
use_cached_outputs
self
.
use_cached_outputs
=
use_cached_outputs
...
@@ -334,15 +322,15 @@ class LLMEngine:
...
@@ -334,15 +322,15 @@ class LLMEngine:
self
.
seq_counter
=
Counter
()
self
.
seq_counter
=
Counter
()
self
.
generation_config_fields
=
_load_generation_config_dict
(
self
.
generation_config_fields
=
_load_generation_config_dict
(
model_config
)
self
.
model_config
)
self
.
input_preprocessor
=
InputPreprocessor
(
model_config
,
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
model_config
,
self
.
tokenizer
,
self
.
tokenizer
,
mm_registry
)
mm_registry
)
self
.
input_registry
=
input_registry
self
.
input_registry
=
input_registry
self
.
input_processor
=
input_registry
.
create_input_processor
(
self
.
input_processor
=
input_registry
.
create_input_processor
(
model_config
)
self
.
model_config
)
self
.
model_executor
=
executor_class
(
vllm_config
=
vllm_config
,
)
self
.
model_executor
=
executor_class
(
vllm_config
=
vllm_config
,
)
...
@@ -354,36 +342,36 @@ class LLMEngine:
...
@@ -354,36 +342,36 @@ class LLMEngine:
from
vllm.model_executor.model_loader
import
(
from
vllm.model_executor.model_loader
import
(
get_architecture_class_name
)
get_architecture_class_name
)
usage_message
.
report_usage
(
usage_message
.
report_usage
(
get_architecture_class_name
(
model_config
),
get_architecture_class_name
(
self
.
model_config
),
usage_context
,
usage_context
,
extra_kvs
=
{
extra_kvs
=
{
# Common configuration
# Common configuration
"dtype"
:
"dtype"
:
str
(
model_config
.
dtype
),
str
(
self
.
model_config
.
dtype
),
"tensor_parallel_size"
:
"tensor_parallel_size"
:
parallel_config
.
tensor_parallel_size
,
self
.
parallel_config
.
tensor_parallel_size
,
"block_size"
:
"block_size"
:
cache_config
.
block_size
,
self
.
cache_config
.
block_size
,
"gpu_memory_utilization"
:
"gpu_memory_utilization"
:
cache_config
.
gpu_memory_utilization
,
self
.
cache_config
.
gpu_memory_utilization
,
# Quantization
# Quantization
"quantization"
:
"quantization"
:
model_config
.
quantization
,
self
.
model_config
.
quantization
,
"kv_cache_dtype"
:
"kv_cache_dtype"
:
str
(
cache_config
.
cache_dtype
),
str
(
self
.
cache_config
.
cache_dtype
),
# Feature flags
# Feature flags
"enable_lora"
:
"enable_lora"
:
bool
(
lora_config
),
bool
(
self
.
lora_config
),
"enable_prompt_adapter"
:
"enable_prompt_adapter"
:
bool
(
prompt_adapter_config
),
bool
(
self
.
prompt_adapter_config
),
"enable_prefix_caching"
:
"enable_prefix_caching"
:
cache_config
.
enable_prefix_caching
,
self
.
cache_config
.
enable_prefix_caching
,
"enforce_eager"
:
"enforce_eager"
:
model_config
.
enforce_eager
,
self
.
model_config
.
enforce_eager
,
"disable_custom_all_reduce"
:
"disable_custom_all_reduce"
:
parallel_config
.
disable_custom_all_reduce
,
self
.
parallel_config
.
disable_custom_all_reduce
,
})
})
if
self
.
tokenizer
:
if
self
.
tokenizer
:
...
@@ -402,7 +390,7 @@ class LLMEngine:
...
@@ -402,7 +390,7 @@ class LLMEngine:
for
_
in
range
(
self
.
parallel_config
.
pipeline_parallel_size
)
for
_
in
range
(
self
.
parallel_config
.
pipeline_parallel_size
)
]
]
if
model_config
.
use_async_output_proc
:
if
self
.
model_config
.
use_async_output_proc
:
process_model_outputs
=
weak_bind
(
self
.
_process_model_outputs
)
process_model_outputs
=
weak_bind
(
self
.
_process_model_outputs
)
self
.
async_callbacks
=
[
self
.
async_callbacks
=
[
...
@@ -422,11 +410,11 @@ class LLMEngine:
...
@@ -422,11 +410,11 @@ class LLMEngine:
# GPU and CPU blocks, which are profiled in the distributed executor.
# GPU and CPU blocks, which are profiled in the distributed executor.
self
.
scheduler
=
[
self
.
scheduler
=
[
Scheduler
(
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
,
self
.
scheduler_config
,
self
.
cache_config
,
self
.
lora_config
,
parallel_config
.
pipeline_parallel_size
,
self
.
parallel_config
.
pipeline_parallel_size
,
self
.
async_callbacks
[
v_id
]
self
.
async_callbacks
[
v_id
]
if
model_config
.
use_async_output_proc
else
None
)
if
self
.
model_config
.
use_async_output_proc
else
None
)
for
v_id
in
range
(
parallel_config
.
pipeline_parallel_size
)
for
v_id
in
range
(
self
.
parallel_config
.
pipeline_parallel_size
)
]
]
# Metric Logging.
# Metric Logging.
...
@@ -448,7 +436,8 @@ class LLMEngine:
...
@@ -448,7 +436,8 @@ class LLMEngine:
"prometheus"
:
"prometheus"
:
PrometheusStatLogger
(
PrometheusStatLogger
(
local_interval
=
_LOCAL_LOGGING_INTERVAL_SEC
,
local_interval
=
_LOCAL_LOGGING_INTERVAL_SEC
,
labels
=
dict
(
model_name
=
model_config
.
served_model_name
),
labels
=
dict
(
model_name
=
self
.
model_config
.
served_model_name
),
max_model_len
=
self
.
model_config
.
max_model_len
),
max_model_len
=
self
.
model_config
.
max_model_len
),
}
}
self
.
stat_loggers
[
"prometheus"
].
info
(
"cache_config"
,
self
.
stat_loggers
[
"prometheus"
].
info
(
"cache_config"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment