Commit ead94d93 authored by zhuwenwen

Merge remote-tracking branch 'mirror/main'

parents fcffb7c8 f780504d
# This script is run by Buildkite: it runs the benchmarks and uploads the results as a Buildkite annotation
set -ex
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# In this file, you can add more tests to run either by adding a new step or
# by adding a new command to an existing step. See the existing steps for
# examples of the available options.
# This file is fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file (a rendering sketch follows the template below).
steps:
- label: Regression Test
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
command: pytest -v -s async_engine
- label: Distributed Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only 1 or 2 GPUs are supported for now
- label: Engine Test
command: pytest -v -s engine
- label: Kernels Test
command: pytest -v -s kernels
soft_fail: true
- label: Models Test
commands:
- pytest -v -s models --forked
soft_fail: true
- label: Samplers Test
command: pytest -v -s samplers --forked
- label: Worker Test
command: pytest -v -s worker
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
- pip install aiohttp
- bash run-benchmarks.sh
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
- wait
{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}
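For reference, a minimal sketch of how the step list above could be rendered through `test-template.j2` (the file names and the use of the `jinja2`/`pyyaml` packages are assumptions for illustration; the actual rendering is wired up in the Buildkite setup, not by this snippet):

```python
import jinja2
import yaml

# Load the step definitions from the pipeline file (file name assumed).
with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

# Render them through the Jinja template shown above to get the final pipeline.
with open(".buildkite/test-template.j2") as f:
    template = jinja2.Template(f.read())

print(template.render(steps=steps))
```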
# The vLLM Dockerfile is used to construct a vLLM image that can be used
# directly to run the OpenAI-compatible server.
#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
    && apt-get install -y python3-pip git
WORKDIR /workspace
......@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################
# image to build pytorch extensions
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build
# install build dependencies
......@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
......@@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
RUN python3 setup.py build_ext --inplace
#################### EXTENSION BUILD IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test
# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
......@@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################
FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
......@@ -81,4 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
......@@ -16,6 +16,15 @@ Easy, fast, and cheap LLM serving for everyone
---
**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)**
We are thrilled to announce our second vLLM Meetup!
The vLLM team will share recent updates and the roadmap.
We will also have vLLM collaborators from IBM joining us on stage to share their insights on LLM optimizations.
Please register [here](https://lu.ma/ygxbpzhl) and join us!
---
*Latest News* 🔥
- [2023/12] Added ROCm support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
......@@ -98,4 +107,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
year={2023}
}
```
......@@ -9,11 +9,15 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import os
import sys
from sphinx.ext import autodoc
import logging
sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
logger = logging.getLogger(__name__)
# -- Project information -----------------------------------------------------
......@@ -21,7 +25,6 @@ project = 'vLLM'
copyright = '2023, vLLM Team'
author = 'the vLLM Team'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
......@@ -32,6 +35,8 @@ extensions = [
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
]
# Add any paths that contain templates here, relative to this directory.
......@@ -55,7 +60,6 @@ html_title = project
html_theme = 'sphinx_book_theme'
html_logo = 'assets/logos/vllm-logo-text-light.png'
html_theme_options = {
'logo_only': True,
'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/vllm-project/vllm',
'use_repository_button': True,
......@@ -64,4 +68,29 @@ html_theme_options = {
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Mock out external dependencies here.
autodoc_mock_imports = [
"torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
"vllm.cuda_utils", "vllm._C"
]
for mock_target in autodoc_mock_imports:
if mock_target in sys.modules:
logger.info(
f"Potentially problematic mock target ({mock_target}) found; "
"autodoc_mock_imports cannot mock modules that have already "
"been loaded into sys.modules when the sphinx build starts.")
class MockedClassDocumenter(autodoc.ClassDocumenter):
"""Remove note about base class when a class is derived from object."""
def add_line(self, line: str, source: str, *lineno: int) -> None:
if line == " Bases: :py:class:`object`":
return
super().add_line(line, source, *lineno)
autodoc.ClassDocumenter = MockedClassDocumenter
AsyncLLMEngine
=================================
.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
:members: generate, abort
:show-inheritance:
vLLM Engine
=================================
.. automodule:: vllm.engine
.. currentmodule:: vllm.engine
.. toctree::
:maxdepth: 2
:caption: Engines
llm_engine
async_llm_engine
LLMEngine
=================================
.. autoclass:: vllm.engine.llm_engine.LLMEngine
:members: add_request, abort_request, step, _init_cache
:show-inheritance:
......@@ -11,6 +11,14 @@ This guide shows how to use vLLM to:
Be sure to complete the :ref:`installation instructions <installation>` before continuing with this guide.
.. note::
By default, vLLM downloads models from `HuggingFace <https://huggingface.co/>`_. If you would like to use models from `ModelScope <https://www.modelscope.cn>`_ in the following examples, please set the environment variable:
.. code-block:: shell
export VLLM_USE_MODELSCOPE=True
Offline Batched Inference
-------------------------
......@@ -40,16 +48,6 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
llm = LLM(model="facebook/opt-125m")
To use a model from www.modelscope.cn:
.. code-block:: shell
export VLLM_USE_MODELSCOPE=True
.. code-block:: python
llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
Call ``llm.generate`` to generate the outputs. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens.
.. code-block:: python
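# A minimal sketch of batched generation; the prompt list here is an
# illustrative assumption, and ``llm`` is the instance created above.
prompts = ["Hello, my name is", "The capital of France is"]
outputs = llm.generate(prompts)
for output in outputs:
    # each RequestOutput carries the original prompt and its completions
    print(output.prompt, output.outputs[0].text)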
......@@ -77,16 +75,6 @@ Start the server:
$ python -m vllm.entrypoints.api_server
To use a model from www.modelscope.cn:
.. code-block:: console
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
$ --model="qwen/Qwen-7B-Chat" \
$ --revision="v1.1.8" \
$ --trust-remote-code
By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
Query the model in shell:
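Alternatively, the same query can be sketched in Python with the ``requests`` package; the ``/generate`` endpoint and its JSON fields are assumptions about the demo server, not shown above:

.. code-block:: python

import requests

# a minimal sketch: POST a prompt to the demo API server
response = requests.post("http://localhost:8000/generate",
                         json={"prompt": "San Francisco is a", "max_tokens": 16})
print(response.json())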
......@@ -107,7 +95,7 @@ OpenAI-Compatible Server
------------------------
vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications that use the OpenAI API.
By default, it starts the server at ``http://localhost:8000``. You can specify the address with the ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
Start the server:
......@@ -116,13 +104,6 @@ Start the server:
$ python -m vllm.entrypoints.openai.api_server \
$ --model facebook/opt-125m
To use a model from www.modelscope.cn:
.. code-block:: console
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
$ --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code
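Because the server speaks the OpenAI API, it can also be queried from Python with the official ``openai`` client, as the Gradio example in this change does; a minimal sketch (the model name and prompt are placeholders):

.. code-block:: python

from openai import OpenAI

# point the client at the local vLLM server; the API key is required but unused
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
completion = client.chat.completions.create(
    model="facebook/opt-125m",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)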
By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:
.. code-block:: console
......
......@@ -85,4 +85,16 @@ Documentation
:maxdepth: 1
:caption: Quantization
quantization/auto_awq
.. toctree::
:maxdepth: 2
:caption: Developer Documentation
dev/engine/engine_index
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
import argparse
from openai import OpenAI
import gradio as gr
# Argument parser setup
parser = argparse.ArgumentParser(
description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
type=str,
default='http://localhost:8000/v1',
help='Model URL')
parser.add_argument('-m',
'--model',
type=str,
required=True,
help='Model name for the chatbot')
parser.add_argument('--temp',
type=float,
default=0.8,
help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
type=str,
default='',
help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
# Parse the arguments
args = parser.parse_args()
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client to interact with the API server
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
def predict(message, history):
# Convert chat history to OpenAI format
history_openai_format = [{
"role": "system",
"content": "You are a great ai assistant."
}]
for human, assistant in history:
history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({
"role": "assistant",
"content": assistant
})
history_openai_format.append({"role": "user", "content": message})
# Create a chat completion request and send it to the API server
stream = client.chat.completions.create(
model=args.model, # Model name to use
messages=history_openai_format, # Chat history
temperature=args.temp, # Temperature for text generation
stream=True, # Stream response
extra_body={
'repetition_penalty':
1,
'stop_token_ids': [
int(id.strip()) for id in args.stop_token_ids.split(',')
if id.strip()
] if args.stop_token_ids else []
})
# Read and return generated text from response stream
partial_message = ""
for chunk in stream:
partial_message += (chunk.choices[0].delta.content or "")
yield partial_message
# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(server_name=args.host,
server_port=args.port,
share=True)
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{% for message in messages %}
{% if message['role'] == 'user' %}
<reserved_106>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
<reserved_107>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
<reserved_107>
{% endif %}
......@@ -13,4 +13,6 @@ types-setuptools
pytest
pytest-forked
pytest-asyncio
httpx
einops # required for MPT
flash_attn # required for HuggingFace's llama implementation
......@@ -352,6 +352,11 @@ def get_requirements() -> List[str]:
return requirements
package_data = {"vllm": ["py.typed"]}
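# If VLLM_USE_PRECOMPILED is set, skip building the C++/CUDA extensions and ship
# the prebuilt .so files via package_data instead (this is what the Dockerfile's
# test image relies on when it runs `VLLM_USE_PRECOMPILED=1 pip install .`).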
if os.environ.get("VLLM_USE_PRECOMPILED"):
ext_modules = []
package_data["vllm"].append("*.so")
setuptools.setup(
name="vllm",
version=get_vllm_version(),
......@@ -380,5 +385,5 @@ setuptools.setup(
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
package_data=package_data,
)
......@@ -29,8 +29,13 @@ def api_server():
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
uvicorn_process = subprocess.Popen([
    sys.executable,
    "-u",
    str(script_path),
    "--model",
    "facebook/opt-125m",
    "--host",
    "127.0.0.1",
])
yield
uvicorn_process.terminate()
......@@ -81,6 +86,9 @@ def test_api_server(api_server):
pool.join()
# check cancellation stats
# give it some time to update the stats
time.sleep(1)
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
......
from argparse import Namespace
from dataclasses import dataclass
import os
import pathlib
import pytest
from fastapi.testclient import TestClient
from vllm.entrypoints.openai.api_server import *
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
"""<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
......@@ -21,8 +26,7 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
"""<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
......@@ -54,8 +58,7 @@ class MockTokenizer:
def test_load_chat_template():
# Testing chatml template
mock_args = Namespace(chat_template=chatml_jinja_path)
tokenizer = MockTokenizer()
# Call the function with the mocked args
......
......@@ -2,10 +2,9 @@
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
import pytest
import torch
import ray
from vllm.config import ParallelConfig
from vllm.utils import get_open_port
......@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
tensor_parallel_size,
worker_use_ray=True)
distributed_init_method = f"tcp://localhost:{distributed_init_port}"
torch.cuda.set_device(rank)
_init_distributed_environment(parallel_config, rank,
distributed_init_method)
@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
......@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
assert torch.allclose(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
......@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@pytest.mark.parametrize("test_target",
[all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
    # Using Ray, rather than multiprocessing, makes it easier to debug failures.
    ray.init()

    distributed_init_port = get_open_port()
    refs = []
    for rank in range(tensor_parallel_size):
        refs.append(
            test_target.remote(tensor_parallel_size, rank,
                               distributed_init_port))
    ray.get(refs)

    ray.shutdown()
......@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
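# i.e. the number of float32 elements that fit in shared memory, minus a 512-element safety margin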
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 12000 # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16, torch.float]
......