Fix run error (profiling dummy-data call, LLMEngine init cleanup, throughput benchmark)

9b902f9e · zhuwenwen · a48d654d · 9b902f9e · 9b902f9e · 9b902f9e
Commit 9b902f9e authored Sep 11, 2024 by zhuwenwen
5 changed files
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -202,12 +202,12 @@ def which_attn_to_use(
        # AMD GPUs.
        selected_backend = (_Backend.ROCM_FLASH if selected_backend
                            == _Backend.FLASH_ATTN else selected_backend)
-        if selected_backend == _Backend.ROCM_FLASH:
-            if current_platform.get_device_capability()[0] != 9:
-                # not Instinct series GPUs.
-                logger.info("flash_attn is not supported on NAVI GPUs.")
-        else:
-            logger.info("%s is not supported in AMD GPUs.", selected_backend)
+        # if selected_backend == _Backend.ROCM_FLASH:
+        #     if current_platform.get_device_capability()[0] != 9:
+        #         # not Instinct series GPUs.
+        #         logger.info("flash_attn is not supported on NAVI GPUs.")
+        # else:
+        #     logger.info("%s is not supported in AMD GPUs.", selected_backend)
        return _Backend.ROCM_FLASH

    # FlashAttn in NVIDIA GPUs.

--- a/vllm/benchmark_throughput.py
+++ b/vllm/benchmark_throughput.py
@@ -7,14 +7,16 @@ from typing import List, Optional, Tuple

 import numpy as np
 import torch
+import uvloop
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
-
-from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptInputs
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators


 def sample_requests(
@@ -85,8 +87,11 @@ def run_vllm(
    max_num_batched_tokens: int,
    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
+    num_scheduler_steps: int = 1,
+    use_v2_block_manager: bool = False,
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
+    disable_async_output_proc: bool = False,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
@@ -109,6 +114,9 @@ def run_vllm(
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
        load_format=load_format,
+        num_scheduler_steps=num_scheduler_steps,
+        use_v2_block_manager=use_v2_block_manager,
+        disable_async_output_proc=disable_async_output_proc,
    )

    # Add the requests to the engine.
@@ -167,6 +175,93 @@ def run_vllm(
    return end - start


+async def run_vllm_async(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
+    gpu_memory_utilization: float = 0.9,
+    num_scheduler_steps: int = 1,
+    use_v2_block_manager: bool = False,
+    download_dir: Optional[str] = None,
+    load_format: str = EngineArgs.load_format,
+    disable_async_output_proc: bool = False,
+    disable_frontend_multiprocessing: bool = False,
+) -> float:
+    from vllm import SamplingParams
+    engine_args = AsyncEngineArgs(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
+        load_format=load_format,
+        num_scheduler_steps=num_scheduler_steps,
+        use_v2_block_manager=use_v2_block_manager,
+        disable_async_output_proc=disable_async_output_proc,
+        worker_use_ray=False,
+        engine_use_ray=False,
+        disable_log_requests=True,
+    )
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Add the requests to the engine.
+        prompts: List[str] = []
+        sampling_params: List[SamplingParams] = []
+        for prompt, _, output_len in requests:
+            prompts.append(prompt)
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=0.0 if use_beam_search else 1.0,
+                    top_p=1.0,
+                    use_beam_search=use_beam_search,
+                    ignore_eos=True,
+                    max_tokens=output_len,
+                ))
+
+        generators = []
+        start = time.perf_counter()
+        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+            generator = llm.generate(prompt, sp, request_id=f"test{i}")
+            generators.append(generator)
+        all_gens = merge_async_iterators(*generators)
+        async for i, res in all_gens:
+            pass
+        end = time.perf_counter()
+        return end - start
+
+
 def run_hf(
    requests: List[Tuple[str, int, int]],
    model: str,
@@ -266,15 +361,24 @@ def main(args: argparse.Namespace):
                                   args.output_len)

    if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            warmup_requests, requests, args.model, args.tokenizer, args.quantization,
+        run_args = [
+            requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.max_model_len,
            args.enforce_eager, args.kv_cache_dtype,
            args.quantization_param_path, args.device,
            args.enable_prefix_caching, args.enable_chunked_prefill,
            args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.gpu_memory_utilization, args.download_dir, args.load_format)
+            args.gpu_memory_utilization, args.num_scheduler_steps,
+            args.use_v2_block_manager, args.download_dir, args.load_format,
+            args.disable_async_output_proc
+        ]
+
+        if args.async_engine:
+            run_args.append(args.disable_frontend_multiprocessing)
+            elapsed_time = uvloop.run(run_vllm_async(*run_args))
+        else:
+            elapsed_time = run_vllm(*run_args)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -407,10 +511,18 @@ if __name__ == "__main__":
        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
        'CPU.')
+    parser.add_argument(
+        "--num-scheduler-steps",
+        type=int,
+        default=1,
+        help="Maximum number of forward steps per scheduler call.")
+    parser.add_argument("--use-v2-block-manager",
+                        action='store_true',
+                        help="Enable block manager v2.")
    parser.add_argument(
        "--enable-prefix-caching",
        action='store_true',
-        help="enable automatic prefix caching for vLLM backend.")
+        help="Enable automatic prefix caching for vLLM backend.")
    parser.add_argument("--enable-chunked-prefill",
                        action='store_true',
                        help="enable chunked prefill for vLLM backend.")
@@ -459,6 +571,19 @@ if __name__ == "__main__":
        'section for more information.\n'
        '* "bitsandbytes" will load the weights using bitsandbytes '
        'quantization.\n')
+    parser.add_argument(
+        "--disable-async-output-proc",
+        action='store_true',
+        default=False,
+        help="Disable async output processor for vLLM backend.")
+    parser.add_argument("--async-engine",
+                        action='store_true',
+                        default=False,
+                        help="Use vLLM async engine rather than LLM class.")
+    parser.add_argument("--disable-frontend-multiprocessing",
+                        action='store_true',
+                        default=False,
+                        help="Disable decoupled async engine frontend.")
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -315,128 +315,136 @@ class LLMEngine:
            observability_config=self.observability_config,
        )

-        if not self.model_config.embedding_mode:
-            self._initialize_kv_caches()
-
-        # If usage stat is enabled, collect relevant info.
-        if is_usage_stats_enabled():
-            from vllm.model_executor.model_loader import (
-                get_architecture_class_name)
-            usage_message.report_usage(
-                get_architecture_class_name(model_config),
-                usage_context,
-                extra_kvs={
-                    # Common configuration
-                    "dtype":
-                    str(model_config.dtype),
-                    "tensor_parallel_size":
-                    parallel_config.tensor_parallel_size,
-                    "block_size":
-                    cache_config.block_size,
-                    "gpu_memory_utilization":
-                    cache_config.gpu_memory_utilization,
-
-                    # Quantization
-                    "quantization":
-                    model_config.quantization,
-                    "kv_cache_dtype":
-                    str(cache_config.cache_dtype),
-
-                    # Feature flags
-                    "enable_lora":
-                    bool(lora_config),
-                    "enable_prompt_adapter":
-                    bool(prompt_adapter_config),
-                    "enable_prefix_caching":
-                    cache_config.enable_prefix_caching,
-                    "enforce_eager":
-                    model_config.enforce_eager,
-                    "disable_custom_all_reduce":
-                    parallel_config.disable_custom_all_reduce,
-                })
-
-        if self.tokenizer:
-            # Ping the tokenizer to ensure liveness if it runs in a
-            # different process.
-            self.tokenizer.ping()
-
-        self.cached_scheduler_outputs = [
-            SchedulerOutputState()
-            for _ in range(self.parallel_config.pipeline_parallel_size)
-        ]
-
-        self.scheduler_contexts = [
-            SchedulerContext()
-            for _ in range(self.parallel_config.pipeline_parallel_size)
-        ]
-
-        self.async_callbacks = [
-            functools.partial(self._process_model_outputs,
-                              ctx=self.scheduler_contexts[v_id])
-            for v_id in range(self.parallel_config.pipeline_parallel_size)
-        ]
-
-        # Currently used by AsyncLLMEngine to ensure quick append
-        # of request outputs to asyncio queues
-        self.process_request_outputs_callback = None
-
-        # Create the scheduler.
-        # NOTE: the cache_config here have been updated with the numbers of
-        # GPU and CPU blocks, which are profiled in the distributed executor.
-        self.scheduler = [
-            Scheduler(
-                scheduler_config, cache_config, lora_config,
-                parallel_config.pipeline_parallel_size,
-                self.async_callbacks[v_id]
-                if model_config.use_async_output_proc else None)
-            for v_id in range(parallel_config.pipeline_parallel_size)
-        ]
-
-        # Metric Logging.
-        if self.log_stats:
-            if stat_loggers is not None:
-                self.stat_loggers = stat_loggers
-            else:
-                # Lazy import for prometheus multiprocessing.
-                # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
-                # before prometheus_client is imported.
-                # See https://prometheus.github.io/client_python/multiprocess/
-                from vllm.engine.metrics import (LoggingStatLogger,
-                                                 PrometheusStatLogger)
-
-                self.stat_loggers = {
-                    "logging":
-                    LoggingStatLogger(
-                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC),
-                    "prometheus":
-                    PrometheusStatLogger(
-                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                        labels=dict(model_name=model_config.served_model_name),
-                        max_model_len=self.model_config.max_model_len),
-                }
-                self.stat_loggers["prometheus"].info("cache_config",
-                                                     self.cache_config)
-
-        self.tracer = None
-        if self.observability_config.otlp_traces_endpoint:
-            self.tracer = init_tracer(
-                "vllm.llm_engine",
-                self.observability_config.otlp_traces_endpoint)
-
-        # Create sequence output processor, e.g. for beam search or
-        # speculative decoding.
-        self.output_processor = (
-            SequenceGroupOutputProcessor.create_output_processor(
-                self.scheduler_config,
-                self.detokenizer,
-                self.scheduler,
-                self.seq_counter,
-                get_tokenizer_for_seq,
-                stop_checker=StopChecker(
-                    self.scheduler_config.max_model_len,
+        init_success = False
+        try:
+            if not self.model_config.embedding_mode:
+                self._initialize_kv_caches()
+
+            # If usage stat is enabled, collect relevant info.
+            if is_usage_stats_enabled():
+                from vllm.model_executor.model_loader import (
+                    get_architecture_class_name)
+                usage_message.report_usage(
+                    get_architecture_class_name(model_config),
+                    usage_context,
+                    extra_kvs={
+                        # Common configuration
+                        "dtype":
+                        str(model_config.dtype),
+                        "tensor_parallel_size":
+                        parallel_config.tensor_parallel_size,
+                        "block_size":
+                        cache_config.block_size,
+                        "gpu_memory_utilization":
+                        cache_config.gpu_memory_utilization,
+
+                        # Quantization
+                        "quantization":
+                        model_config.quantization,
+                        "kv_cache_dtype":
+                        str(cache_config.cache_dtype),
+
+                        # Feature flags
+                        "enable_lora":
+                        bool(lora_config),
+                        "enable_prompt_adapter":
+                        bool(prompt_adapter_config),
+                        "enable_prefix_caching":
+                        cache_config.enable_prefix_caching,
+                        "enforce_eager":
+                        model_config.enforce_eager,
+                        "disable_custom_all_reduce":
+                        parallel_config.disable_custom_all_reduce,
+                    })
+
+            if self.tokenizer:
+                # Ping the tokenizer to ensure liveness if it runs in a
+                # different process.
+                self.tokenizer.ping()
+
+            self.cached_scheduler_outputs = [
+                SchedulerOutputState()
+                for _ in range(self.parallel_config.pipeline_parallel_size)
+            ]
+
+            self.scheduler_contexts = [
+                SchedulerContext()
+                for _ in range(self.parallel_config.pipeline_parallel_size)
+            ]
+
+            self.async_callbacks = [
+                functools.partial(self._process_model_outputs,
+                                ctx=self.scheduler_contexts[v_id])
+                for v_id in range(self.parallel_config.pipeline_parallel_size)
+            ]
+
+            # Currently used by AsyncLLMEngine to ensure quick append
+            # of request outputs to asyncio queues
+            self.process_request_outputs_callback = None
+
+            # Create the scheduler.
+            # NOTE: the cache_config here have been updated with the numbers of
+            # GPU and CPU blocks, which are profiled in the distributed executor.
+            self.scheduler = [
+                Scheduler(
+                    scheduler_config, cache_config, lora_config,
+                    parallel_config.pipeline_parallel_size,
+                    self.async_callbacks[v_id]
+                    if model_config.use_async_output_proc else None)
+                for v_id in range(parallel_config.pipeline_parallel_size)
+            ]
+
+            # Metric Logging.
+            if self.log_stats:
+                if stat_loggers is not None:
+                    self.stat_loggers = stat_loggers
+                else:
+                    # Lazy import for prometheus multiprocessing.
+                    # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
+                    # before prometheus_client is imported.
+                    # See https://prometheus.github.io/client_python/multiprocess/
+                    from vllm.engine.metrics import (LoggingStatLogger,
+                                                    PrometheusStatLogger)
+
+                    self.stat_loggers = {
+                        "logging":
+                        LoggingStatLogger(
+                            local_interval=_LOCAL_LOGGING_INTERVAL_SEC),
+                        "prometheus":
+                        PrometheusStatLogger(
+                            local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+                            labels=dict(model_name=model_config.served_model_name),
+                            max_model_len=self.model_config.max_model_len),
+                    }
+                    self.stat_loggers["prometheus"].info("cache_config",
+                                                        self.cache_config)
+
+            self.tracer = None
+            if self.observability_config.otlp_traces_endpoint:
+                self.tracer = init_tracer(
+                    "vllm.llm_engine",
+                    self.observability_config.otlp_traces_endpoint)
+    
+            # Create sequence output processor, e.g. for beam search or
+            # speculative decoding.
+            self.output_processor = (
+                SequenceGroupOutputProcessor.create_output_processor(
+                    self.scheduler_config,
+                    self.detokenizer,
+                    self.scheduler,
+                    self.seq_counter,
                    get_tokenizer_for_seq,
-                ),
-            ))
+                    stop_checker=StopChecker(
+                        self.scheduler_config.max_model_len,
+                        get_tokenizer_for_seq,
+                    ),
+                ))
+            init_success = True
+        finally:
+            if not init_success:
+                # Ensure that model_executor is shut down if LLMEngine init
+                # failed
+                self.model_executor.shutdown()

    def _initialize_kv_caches(self) -> None:
        """Initialize the KV cache in the worker(s).

--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -131,8 +131,7 @@ class WorkerMonitor(threading.Thread):
                    if process.exitcode is not None and process.exitcode != 0:
                        died_count += 1
                        logger.error("Worker %s pid %s died, exit code: %s",
-                                     process.name, process.pid,
-                                     process.exitcode)
+                                     process.name, process.pid, process.exitcode)
                if died_count < len(self.workers):
                    logger.info(
                        "Killing remaining local vLLM worker processes")

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1106,12 +1106,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                batch_size += seq_len

                seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
-                    .dummy_data_for_profiling(model_config, seq_len)
-
-                # Having more tokens is over-conservative but otherwise fine
-                assert len(seq_data.prompt_token_ids) >= seq_len, (
-                    f"Expected at least {seq_len} dummy tokens for profiling, "
-                    f"but got: {len(seq_data.prompt_token_ids)}")
+                    .dummy_data_for_profiling(self.model_config, 
+                                              seq_len, 
+                                              self.mm_registry)

                seq = SequenceGroupMetadata(
                    request_id=str(group_id),