"""Config for the tokenizer pool. If None, will use synchronous
tokenization."""
# Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
ray_workers_use_nsight:bool=False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
# ray distributed model workers placement group.
placement_group:Optional["PlacementGroup"]=None
"""ray distributed model workers placement group."""
# Backend to use for distributed model
# workers, either "ray" or "mp" (multiprocessing). If the product
# of pipeline_parallel_size and tensor_parallel_size is less than
# or equal to the number of GPUs available, "mp" will be used to
# keep processing on a single host. Otherwise, this will default
# to "ray" if Ray is installed and fail otherwise. Note that tpu
# and hpu only support Ray for distributed inference.
distributed_executor_backend:Optional[Union[str,
type["ExecutorBase"]]]=None
"""Backend to use for distributed model
workers, either "ray" or "mp" (multiprocessing). If the product
of pipeline_parallel_size and tensor_parallel_size is less than
or equal to the number of GPUs available, "mp" will be used to
keep processing on a single host. Otherwise, this will default
to "ray" if Ray is installed and fail otherwise. Note that tpu
and hpu only support Ray for distributed inference."""
# The full name of the worker class to use. If "auto", the worker class
# will be determined based on the platform.
worker_cls:str="auto"
"""The full name of the worker class to use. If "auto", the worker class
will be determined based on the platform."""
sd_worker_cls:str="auto"
"""The full name of the worker class to use for speculative decoding.
If "auto", the worker class will be determined based on the platform."""
worker_extension_cls:str=""
"""The full name of the worker extension class to use. The worker extension
class is dynamically inherited by the worker class. This is used to inject
new attributes and methods to the worker class for use in collective_rpc
calls."""
# world_size is TPxPP, it affects the number of workers we create.
# Declared with init=False, so it is not a constructor argument
# (assuming `field` is dataclasses.field — confirm against the imports).
world_size:int=field(init=False)
"""world_size is TPxPP, it affects the number of workers we create."""
# world_size_across_dp is TPxPPxDP, it is the size of the world
# including data parallelism.
# Also declared with init=False, like world_size.
world_size_across_dp:int=field(init=False)
"""world_size_across_dp is TPxPPxDP, it is the size of the world
including data parallelism."""
rank:int=0
"""Global rank in distributed setup."""
defget_next_dp_init_port(self)->int:
"""
...
...
@@ -1717,6 +1830,14 @@ class SchedulerConfig:
chunked_prefill_enabled:bool=field(init=False)
# If set to true and chunked prefill is enabled, we do not want to
# partially schedule a multimodal item. Only used in V1.
# This ensures that if a request has a mixed prompt
# (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
# some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
# it will be scheduled as TTTT in one step and IIIIIIIIII in the next.
disable_chunked_mm_input:bool=False
# scheduler class or path. "vllm.core.scheduler.Scheduler" (default)