"""vLLM: a high-throughput and memory-efficient inference engine for LLMs""" # Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/ray/__init__.py#L11 def _configure_system(): import os import sys # Importing flash-attn. thirdparty_files = os.path.join(os.path.abspath(os.path.dirname(__file__)), "thirdparty_files") sys.path.insert(0, thirdparty_files) _configure_system() # Delete configuration function. del _configure_system from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402 from vllm.engine.llm_engine import LLMEngine # noqa: E402 from vllm.engine.ray_utils import initialize_cluster # noqa: E402 from vllm.entrypoints.llm import LLM # noqa: E402 from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402 from vllm.sampling_params import SamplingParams # noqa: E402 __version__ = "0.3.3" __all__ = [ "LLM", "SamplingParams", "RequestOutput", "CompletionOutput", "LLMEngine", "EngineArgs", "AsyncLLMEngine", "AsyncEngineArgs", "initialize_cluster", ]