Unverified commit 0b98ba15, authored by Woosuk Kwon and committed by GitHub

Change the name to vLLM (#150)

parent e5464ee4
......@@ -17,9 +17,9 @@
# -- Project information -----------------------------------------------------
project = 'CacheFlow'
copyright = '2023, CacheFlow Team'
author = 'the CacheFlow Team'
project = 'vLLM'
copyright = '2023, vLLM Team'
author = 'the vLLM Team'
# -- General configuration ---------------------------------------------------
......@@ -55,7 +55,7 @@ html_title = project
html_theme = 'sphinx_book_theme'
html_theme_options = {
'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/WoosukKwon/cacheflow',
'repository_url': 'https://github.com/WoosukKwon/vllm',
'use_repository_button': True,
}
......
Installation
============
CacheFlow is a Python library that includes some C++ and CUDA code.
CacheFlow can run on systems that meet the following requirements:
vLLM is a Python library that includes some C++ and CUDA code.
vLLM can run on systems that meet the following requirements:
* OS: Linux
* Python: 3.8 or higher
......@@ -10,23 +10,23 @@ CacheFlow can run on systems that meet the following requirements:
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, etc.)
.. note::
As of now, CacheFlow does not support CUDA 12.
As of now, vLLM does not support CUDA 12.
If you are using Hopper or Lovelace GPUs, please use CUDA 11.8.
.. tip::
If you have trouble installing CacheFlow, we recommend using the NVIDIA PyTorch Docker image.
If you have trouble installing vLLM, we recommend using the NVIDIA PyTorch Docker image.
.. code-block:: console
$ # Pull the Docker image with CUDA 11.8.
$ docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/pytorch:22.12-py3
Inside the Docker container, please execute :code:`pip uninstall torch` before installing CacheFlow.
Inside the Docker container, please execute :code:`pip uninstall torch` before installing vLLM.
Install with pip
----------------
You can install CacheFlow using pip:
You can install vLLM using pip:
.. code-block:: console
......@@ -34,8 +34,8 @@ You can install CacheFlow using pip:
$ conda create -n myenv python=3.8 -y
$ conda activate myenv
$ # Install CacheFlow.
$ pip install cacheflow # This may take 5-10 minutes.
$ # Install vLLM.
$ pip install vllm # This may take 5-10 minutes.
.. _build_from_source:
......@@ -43,10 +43,10 @@ You can install CacheFlow using pip:
Build from source
-----------------
You can also build and install CacheFlow from source.
You can also build and install vLLM from source.
.. code-block:: console
$ git clone https://github.com/WoosukKwon/cacheflow.git
$ cd cacheflow
$ git clone https://github.com/WoosukKwon/vllm.git
$ cd vllm
$ pip install -e . # This may take 5-10 minutes.
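After an editable install, a quick import check along the following lines can confirm that the build succeeded. This is a sketch; the printed value simply reflects :code:`__version__` as set in :code:`vllm/__init__.py` in this change.

.. code-block:: python

    # Sanity-check the freshly built package.
    import vllm

    print(vllm.__version__)  # "0.1.0" at the time of this commit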
......@@ -8,7 +8,7 @@ Placeholder.
.. code-block:: python
from cacheflow import LLM, SamplingParams
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
......
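The quickstart snippet above is cut off by the collapsed diff; for orientation, a complete minimal example in the same spirit might look like the sketch below. The model name :code:`facebook/opt-125m` is taken from the OpenAI-client example later in this commit, while the sampling values and the :code:`output.outputs[0].text` access pattern are illustrative assumptions, not part of this change.

.. code-block:: python

    from vllm import LLM, SamplingParams

    # Sample prompts (illustrative).
    prompts = [
        "Hello, my name is",
        "The capital of France is",
    ]
    # Sampling settings are assumptions for this sketch.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        # RequestOutput and CompletionOutput are exported from vllm/__init__.py;
        # the exact attribute names used here are assumptions.
        print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")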
Welcome to CacheFlow!
=====================
Welcome to vLLM!
================
Documentation
-------------
......
......@@ -3,30 +3,30 @@
Adding a New Model
==================
This document provides a high-level guide on integrating a `HuggingFace Transformers <https://github.com/huggingface/transformers>`_ model into CacheFlow.
This document provides a high-level guide on integrating a `HuggingFace Transformers <https://github.com/huggingface/transformers>`_ model into vLLM.
.. note::
The complexity of adding a new model depends heavily on the model's architecture.
The process is considerably more straightforward if the model shares a similar architecture with an existing model in CacheFlow.
The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
.. tip::
If you are encountering issues while integrating your model into CacheFlow, feel free to open an issue on our `GitHub <https://github.com/WoosukKwon/cacheflow/issues>`_ repository.
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/WoosukKwon/vllm/issues>`_ repository.
We will be happy to help you out!
0. Fork the CacheFlow repository
0. Fork the vLLM repository
--------------------------------
Start by forking our `GitHub <https://github.com/WoosukKwon/cacheflow>`_ repository and then :ref:`build it from source <build_from_source>`.
Start by forking our `GitHub <https://github.com/WoosukKwon/vllm>`_ repository and then :ref:`build it from source <build_from_source>`.
This gives you the ability to modify the codebase and test your model.
1. Bring your model code
------------------------
Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `cacheflow/model_executor/models <https://github.com/WoosukKwon/cacheflow/tree/main/cacheflow/model_executor/models>`_ directory.
For instance, CacheFlow's `OPT model <https://github.com/WoosukKwon/cacheflow/blob/main/cacheflow/model_executor/models/opt.py>`_ was adapted from HuggingFace's `modeling_opt.py <https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py>`_ file.
Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models <https://github.com/WoosukKwon/vllm/tree/main/vllm/model_executor/models>`_ directory.
For instance, vLLM's `OPT model <https://github.com/WoosukKwon/vllm/blob/main/vllm/model_executor/models/opt.py>`_ was adapted from HuggingFace's `modeling_opt.py <https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py>`_ file.
.. warning::
When copying the model code, make sure to review and adhere to the code's copyright and licensing terms.
......@@ -62,11 +62,11 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
+) -> Dict[int, SequenceOutputs]:
3. Update the code to reflect that :code:`input_ids` and :code:`positions` are now flattened tensors.
4. Replace the attention operation with either :code:`GPTCacheFlowAttention` or :code:`GPTNeoXCacheFlowAttention`, depending on the model's architecture.
4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture.
.. note::
Currently, CacheFlow supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in CacheFlow.
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
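As a rough illustration of steps 3 and 4 above, a rewritten :code:`forward` might end up with a signature like the sketch below. Only the flattened :code:`input_ids`/:code:`positions` tensors, the paged-attention layer names, and the :code:`Dict[int, SequenceOutputs]` return type come from this page; the remaining parameter names are assumptions.

.. code-block:: python

    from typing import Dict

    import torch
    import torch.nn as nn


    class MyModelForCausalLM(nn.Module):  # hypothetical model class
        def forward(
            self,
            input_ids: torch.Tensor,   # flattened token ids across all sequences
            positions: torch.Tensor,   # flattened position ids
            kv_caches,                 # assumed: per-layer KV cache handles
            input_metadata,            # assumed: batching/paging metadata
            cache_events,              # assumed: CUDA events for cache operations
        ) -> Dict[int, "SequenceOutputs"]:
            # Internally, the attention call would go through GPTPagedAttention
            # or GPTNeoXPagedAttention, depending on the architecture.
            ...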
3. (Optional) Implement tensor parallelism support
......@@ -91,4 +91,4 @@ While the process is straightforward for most layers, the tensor-parallel layers
5. Register your model
----------------------
Finally, include your :code:`*ForCausalLM` class in `cacheflow/model_executor/models/__init__.py <https://github.com/WoosukKwon/cacheflow/blob/main/cacheflow/model_executor/models/__init__.py>`_ and register it in the :code:`_MODEL_REGISTRY` in `cacheflow/model_executor/model_loader.py <https://github.com/WoosukKwon/cacheflow/blob/main/cacheflow/model_executor/model_loader.py>`_.
Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py <https://github.com/WoosukKwon/vllm/blob/main/vllm/model_executor/models/__init__.py>`_ and register it in the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py <https://github.com/WoosukKwon/vllm/blob/main/vllm/model_executor/model_loader.py>`_.
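For concreteness, the registration step might look roughly like the sketch below. Only the :code:`_MODEL_REGISTRY` name and the file paths come from this page; the dictionary shape, the :code:`OPTForCausalLM` import, and the hypothetical :code:`MyModelForCausalLM` entry are assumptions.

.. code-block:: python

    # A sketch of vllm/model_executor/model_loader.py after registration.
    from vllm.model_executor.models import OPTForCausalLM  # assumed to be exported per step 5

    _MODEL_REGISTRY = {
        "OPTForCausalLM": OPTForCausalLM,              # existing entry (see the table above)
        # "MyModelForCausalLM": MyModelForCausalLM,    # <- your new, hypothetical entry
    }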
......@@ -3,8 +3,8 @@
Supported Models
================
CacheFlow supports a variety of generative Transformer models in `HuggingFace Transformers <https://github.com/huggingface/transformers>`_.
The following is the list of model architectures that are currently supported by CacheFlow.
vLLM supports a variety of generative Transformer models in `HuggingFace Transformers <https://github.com/huggingface/transformers>`_.
The following is the list of model architectures that are currently supported by vLLM.
Alongside each architecture, we include some popular models that use it.
.. list-table::
......@@ -22,19 +22,19 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`OPTForCausalLM`
- OPT, OPT-IML
If your model uses one of the above model architectures, you can seamlessly run your model with CacheFlow.
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
Alternatively, you can raise an issue on our `GitHub <https://github.com/WoosukKwon/cacheflow/issues>`_ project.
Alternatively, you can raise an issue on our `GitHub <https://github.com/WoosukKwon/vllm/issues>`_ project.
.. tip::
The easiest way to check if your model is supported is to run the program below:
.. code-block:: python
from cacheflow import LLM
from vllm import LLM
llm = LLM(model=...) # Name or path of your model
output = llm.generate("Hello, my name is")
print(output)
If CacheFlow successfully generates text, it indicates that your model is supported.
If vLLM successfully generates text, it indicates that your model is supported.
"""Example Python client for cacheflow.entrypoints.api_server"""
"""Example Python client for vllm.entrypoints.api_server"""
import argparse
import json
......
......@@ -6,7 +6,7 @@ import requests
def http_bot(prompt):
headers = {"User-Agent": "Cacheflow Client"}
headers = {"User-Agent": "vLLM Client"}
pload = {
"prompt": prompt,
"stream": True,
......@@ -24,7 +24,7 @@ def http_bot(prompt):
def build_demo():
with gr.Blocks() as demo:
gr.Markdown(
"# Cacheflow text completion demo\n"
"# vLLM text completion demo\n"
)
inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model")
......
import argparse
from cacheflow import EngineArgs, LLMEngine, SamplingParams
from vllm import EngineArgs, LLMEngine, SamplingParams
def main(args: argparse.Namespace):
......
from cacheflow import LLM, SamplingParams
from vllm import LLM, SamplingParams
# Sample prompts.
......
import openai
# Modify OpenAI's API key and API base to use CacheFlow's API server.
# Modify OpenAI's API key and API base to use vLLM's API server.
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"
model = "facebook/opt-125m"
......
......@@ -3,6 +3,6 @@ python_version = 3.8
ignore_missing_imports = True
files = cacheflow
files = vllm
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = cacheflow/model_executor/parallel_utils/|cacheflow/model_executor/models/
exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/
......@@ -75,7 +75,7 @@ ext_modules = []
# Cache operations.
cache_extension = CUDAExtension(
name="cacheflow.cache_ops",
name="vllm.cache_ops",
sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
......@@ -83,7 +83,7 @@ ext_modules.append(cache_extension)
# Attention kernels.
attention_extension = CUDAExtension(
name="cacheflow.attention_ops",
name="vllm.attention_ops",
sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
......@@ -91,7 +91,7 @@ ext_modules.append(attention_extension)
# Positional encoding kernels.
positional_encoding_extension = CUDAExtension(
name="cacheflow.pos_encoding_ops",
name="vllm.pos_encoding_ops",
sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
......@@ -99,7 +99,7 @@ ext_modules.append(positional_encoding_extension)
# Layer normalization kernels.
layernorm_extension = CUDAExtension(
name="cacheflow.layernorm_ops",
name="vllm.layernorm_ops",
sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
......@@ -107,7 +107,7 @@ ext_modules.append(layernorm_extension)
# Activation kernels.
activation_extension = CUDAExtension(
name="cacheflow.activation_ops",
name="vllm.activation_ops",
sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS},
)
......@@ -144,18 +144,18 @@ def get_requirements() -> List[str]:
setuptools.setup(
name="cacheflow",
version=find_version(get_path("cacheflow", "__init__.py")),
author="CacheFlow Team",
author_email="cacheflow@gmail.com",
name="vllm",
version=find_version(get_path("vllm", "__init__.py")),
author="vLLM Team",
author_email="vllm@gmail.com", # FIXME
license="Apache 2.0",
description="CacheFlow: A high-performance LLM Serving System",
description="vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention", # FIXME
long_description=read_readme(),
long_description_content_type="text/markdown",
url="https://github.com/WoosukKwon/cacheflow",
url="https://github.com/WoosukKwon/vllm",
project_urls={
"Homepage": "https://github.com/WoosukKwon/cacheflow",
"Documentation": "https://cacheflow.readthedocs.io/en/latest/",
"Homepage": "https://github.com/WoosukKwon/vllm",
"Documentation": "https://vllm.readthedocs.io/en/latest/", # FIXME
},
classifiers=[
"Programming Language :: Python :: 3.8",
......
import torch
import torch.nn.functional as F
from cacheflow import activation_ops
from vllm import activation_ops
def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
......
......@@ -5,7 +5,7 @@ import torch
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from cacheflow import attention_ops
from vllm import attention_ops
MAX_SEQ_LEN = 4096
TEST_SEED = 0
......
......@@ -2,7 +2,7 @@ import random
import torch
from cacheflow import cache_ops
from vllm import cache_ops
@torch.inference_mode()
......
import torch
import torch.nn as nn
from cacheflow import layernorm_ops
from vllm import layernorm_ops
class RefRMSNorm(nn.Module):
......
......@@ -4,7 +4,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from cacheflow import pos_encoding_ops
from vllm import pos_encoding_ops
def rotate_half(x: torch.Tensor) -> torch.Tensor:
......
from cacheflow.engine.arg_utils import EngineArgs
from cacheflow.engine.llm_engine import LLMEngine
from cacheflow.engine.ray_utils import initialize_cluster
from cacheflow.entrypoints.llm import LLM
from cacheflow.outputs import CompletionOutput, RequestOutput
from cacheflow.sampling_params import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.ray_utils import initialize_cluster
from vllm.entrypoints.llm import LLM
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams
__version__ = "0.1.0"
......@@ -14,5 +15,7 @@ __all__ = [
"CompletionOutput",
"LLMEngine",
"EngineArgs",
"AsyncLLMEngine",
"AsyncEngineArgs",
"initialize_cluster",
]
"""Token blocks."""
from typing import List
from cacheflow.utils import Device
from vllm.utils import Device
_BLANK_TOKEN_ID = -1
......