[Core] Make scheduling policy settable via EngineArgs (#8956)

be76e5aa · Sebastian Schoennenbeck · GitHub · 2ae25f79 · be76e5aa
Unverified Commit be76e5aa authored Sep 30, 2024 by Sebastian Schoennenbeck Committed by GitHub Sep 30, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 2 deletions

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +14 -2

No files found.
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2,8 +2,8 @@ import argparse
 import dataclasses
 import json
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
-                    Type, Union)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional,
+                    Tuple, Type, Union)

 import torch

@@ -177,6 +177,7 @@ class EngineArgs:
    disable_async_output_proc: bool = False
    override_neuron_config: Optional[Dict[str, Any]] = None
    mm_processor_kwargs: Optional[Dict[str, Any]] = None
+    scheduling_policy: Literal["fcfs", "priority"] = "fcfs"

    def __post_init__(self):
        if self.tokenizer is None:
@@ -797,6 +798,16 @@ class EngineArgs:
            default=None,
            help="override or set neuron device configuration.")

+        parser.add_argument(
+            '--scheduling-policy',
+            choices=['fcfs', 'priority'],
+            default="fcfs",
+            help='The scheduling policy to use. "fcfs" (first come first served'
+            ', i.e. requests are handled in order of arrival; default) '
+            'or "priority" (requests are handled based on given '
+            'priority (lower value means earlier handling) and time of '
+            'arrival deciding any ties).')
+
        return parser

    @classmethod
@@ -1011,6 +1022,7 @@ class EngineArgs:
            multi_step_stream_outputs=self.multi_step_stream_outputs,
            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                             and parallel_config.use_ray),
+            policy=self.scheduling_policy,
        )
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,