Commit ead94d93 authored by zhuwenwen

Merge remote-tracking branch 'mirror/main'

parents fcffb7c8 f780504d
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file (a rough rendering sketch follows the step list below).
steps:
- label: Regression Test
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
command: pytest -v -s async_engine
- label: Distributed Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only 1 or 2 GPUs are supported for now.
- label: Engine Test
command: pytest -v -s engine
- label: Kernels Test
command: pytest -v -s kernels
soft_fail: true
- label: Models Test
commands:
- pytest -v -s models --forked
soft_fail: true
- label: Samplers Test
command: pytest -v -s samplers --forked
- label: Worker Test
command: pytest -v -s worker
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
- pip install aiohttp
- bash run-benchmarks.sh
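These step definitions are not uploaded to Buildkite directly; they are rendered through the Jinja template `test-template.j2` (shown next) to produce the final pipeline. As a rough sketch of that rendering step, assuming `jinja2` and `PyYAML` are installed (the file paths and this standalone driver are illustrative, not part of the repository):

import yaml
from jinja2 import Template

# Load the step list defined in test-pipeline.yaml ...
with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

# ... and render it through the Jinja template to produce the final pipeline YAML.
with open(".buildkite/test-template.j2") as f:
    pipeline_yaml = Template(f.read()).render(steps=steps)

print(pipeline_yaml)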
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
- wait
{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}
# The vLLM Dockerfile is used to construct a vLLM image that can be used directly
# to run the OpenAI-compatible server.
#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git
WORKDIR /workspace
...@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################
#################### EXTENSION BUILD IMAGE ####################
# image to build pytorch extensions
FROM dev AS build
# install build dependencies
...@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
...@@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test
# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# ignore build dependencies installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
ENTRYPOINT ["python3", "-m", "pytest", "tests"]
#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
...@@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################
FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
...@@ -81,4 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
...@@ -16,6 +16,15 @@ Easy, fast, and cheap LLM serving for everyone
---
**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)**
We are thrilled to announce our second vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from IBM coming up to the stage to discuss their insights on LLM optimizations.
Please register [here](https://lu.ma/ygxbpzhl) and join us!
---
*Latest News* 🔥
- [2023/12] Added ROCm support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
...@@ -98,4 +107,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
year={2023}
}
```
\ No newline at end of file
...@@ -9,11 +9,15 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import os
import sys
from sphinx.ext import autodoc
import logging
sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
logger = logging.getLogger(__name__)
# -- Project information -----------------------------------------------------
...@@ -21,7 +25,6 @@ project = 'vLLM'
copyright = '2023, vLLM Team'
author = 'the vLLM Team'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
...@@ -32,6 +35,8 @@ extensions = [
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
]
# Add any paths that contain templates here, relative to this directory.
...@@ -55,7 +60,6 @@ html_title = project
html_theme = 'sphinx_book_theme'
html_logo = 'assets/logos/vllm-logo-text-light.png'
html_theme_options = {
'logo_only': True,
'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/vllm-project/vllm',
'use_repository_button': True,
...@@ -64,4 +68,29 @@ html_theme_options = {
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Mock out external dependencies here.
autodoc_mock_imports = [
"torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
"vllm.cuda_utils", "vllm._C"
]
for mock_target in autodoc_mock_imports:
if mock_target in sys.modules:
logger.info(
f"Potentially problematic mock target ({mock_target}) found; "
"autodoc_mock_imports cannot mock modules that have already "
"been loaded into sys.modules when the sphinx build starts.")
class MockedClassDocumenter(autodoc.ClassDocumenter):
"""Remove note about base class when a class is derived from object."""
def add_line(self, line: str, source: str, *lineno: int) -> None:
if line == " Bases: :py:class:`object`":
return
super().add_line(line, source, *lineno)
autodoc.ClassDocumenter = MockedClassDocumenter
AsyncLLMEngine
=================================
.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
:members: generate, abort
:show-inheritance:
vLLM Engine
=================================
.. automodule:: vllm.engine
.. currentmodule:: vllm.engine
.. toctree::
:maxdepth: 2
:caption: Engines
llm_engine
async_llm_engine
LLMEngine
=================================
.. autoclass:: vllm.engine.llm_engine.LLMEngine
:members: add_request, abort_request, step, _init_cache
:show-inheritance:
\ No newline at end of file
...@@ -11,6 +11,14 @@ This guide shows how to use vLLM to:
Be sure to complete the :ref:`installation instructions <installation>` before continuing with this guide.
.. note::
By default, vLLM downloads models from `HuggingFace <https://huggingface.co/>`_. If you would like to use models from `ModelScope <https://www.modelscope.cn>`_ in the following examples, please set the environment variable:
.. code-block:: shell
export VLLM_USE_MODELSCOPE=True
Offline Batched Inference
-------------------------
...@@ -40,16 +48,6 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
llm = LLM(model="facebook/opt-125m")
Use model from www.modelscope.cn
.. code-block:: shell
export VLLM_USE_MODELSCOPE=True
.. code-block:: python
llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
Call ``llm.generate`` to generate the outputs. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
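As a minimal sketch (assuming the ``prompts`` list and ``sampling_params`` defined earlier in this guide):

.. code-block:: python

    outputs = llm.generate(prompts, sampling_params)

    # Print the prompt and the generated text for each request.
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")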
.. code-block:: python
...@@ -77,16 +75,6 @@ Start the server:
$ python -m vllm.entrypoints.api_server
Use model from www.modelscope.cn
.. code-block:: console
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
$ --model="qwen/Qwen-7B-Chat" \
$ --revision="v1.1.8" \
$ --trust-remote-code
By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
Query the model in shell:
...@@ -107,7 +95,7 @@ OpenAI-Compatible Server
------------------------
vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
Start the server:
...@@ -116,13 +104,6 @@ Start the server:
$ python -m vllm.entrypoints.openai.api_server \
$ --model facebook/opt-125m
Use model from www.modelscope.cn
.. code-block:: console
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
$ --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code
By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:
.. code-block:: console
......
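A rough sketch of querying the OpenAI-compatible server from Python, assuming the server started above is listening on ``http://localhost:8000`` and the ``openai`` (v1) client library is installed; the placeholder API key follows the same convention as the Gradio example added in this commit:

.. code-block:: python

    from openai import OpenAI

    # The API key is unused by the local vLLM server; "EMPTY" is a common placeholder.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    completion = client.completions.create(model="facebook/opt-125m",
                                           prompt="San Francisco is a")
    print(completion.choices[0].text)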
...@@ -85,4 +85,16 @@ Documentation
:maxdepth: 1
:caption: Quantization
quantization/auto_awq
\ No newline at end of file
.. toctree::
:maxdepth: 2
:caption: Developer Documentation
dev/engine/engine_index
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
import argparse
from openai import OpenAI
import gradio as gr
# Argument parser setup
parser = argparse.ArgumentParser(
description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
type=str,
default='http://localhost:8000/v1',
help='Model URL')
parser.add_argument('-m',
'--model',
type=str,
required=True,
help='Model name for the chatbot')
parser.add_argument('--temp',
type=float,
default=0.8,
help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
type=str,
default='',
help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
# Parse the arguments
args = parser.parse_args()
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client to interact with the API server
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
def predict(message, history):
# Convert chat history to OpenAI format
history_openai_format = [{
"role": "system",
"content": "You are a great ai assistant."
}]
for human, assistant in history:
history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({
"role": "assistant",
"content": assistant
})
history_openai_format.append({"role": "user", "content": message})
# Create a chat completion request and send it to the API server
stream = client.chat.completions.create(
model=args.model, # Model name to use
messages=history_openai_format, # Chat history
temperature=args.temp, # Temperature for text generation
stream=True, # Stream response
extra_body={
'repetition_penalty':
1,
'stop_token_ids': [
int(id.strip()) for id in args.stop_token_ids.split(',')
if id.strip()
] if args.stop_token_ids else []
})
# Read and return generated text from response stream
partial_message = ""
for chunk in stream:
partial_message += (chunk.choices[0].delta.content or "")
yield partial_message
# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(server_name=args.host,
server_port=args.port,
share=True)
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{% for message in messages %}
{% if message['role'] == 'user' %}
<reserved_106>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
<reserved_107>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
<reserved_107>
{% endif %}
\ No newline at end of file
...@@ -13,4 +13,6 @@ types-setuptools
pytest
pytest-forked
pytest-asyncio
httpx
einops # required for MPT
flash_attn # required for HuggingFace's llama implementation
...@@ -352,6 +352,11 @@ def get_requirements() -> List[str]:
return requirements
package_data = {"vllm": ["py.typed"]}
if os.environ.get("VLLM_USE_PRECOMPILED"):
ext_modules = []
package_data["vllm"].append("*.so")
setuptools.setup(
name="vllm",
version=get_vllm_version(),
...@@ -380,5 +385,5 @@ setuptools.setup(
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
package_data=package_data,
)
...@@ -29,8 +29,13 @@ def api_server():
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
uvicorn_process = subprocess.Popen([
sys.executable,
"-u",
str(script_path),
"--model",
"facebook/opt-125m",
"--host",
"127.0.0.1",
])
yield
uvicorn_process.terminate()
...@@ -81,6 +86,9 @@ def test_api_server(api_server):
pool.join()
# check cancellation stats
# give it some time to update the stats
time.sleep(1)
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
......
from argparse import Namespace
from dataclasses import dataclass
import os
import pathlib
import pytest
from fastapi.testclient import TestClient
from vllm.entrypoints.openai.api_server import *
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...@@ -21,8 +26,7 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...@@ -54,8 +58,7 @@ class MockTokenizer:
def test_load_chat_template():
# Testing chatml template
mock_args = Namespace(chat_template=chatml_jinja_path)
tokenizer = MockTokenizer()
# Call the function with the mocked args
......
...@@ -2,10 +2,9 @@
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
from multiprocessing import Process, set_start_method
import pytest
import torch
import ray
from vllm.config import ParallelConfig
from vllm.utils import get_open_port
...@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
tensor_parallel_size,
worker_use_ray=True)
distributed_init_method = f"tcp://localhost:{distributed_init_port}"
torch.cuda.set_device(rank)
_init_distributed_environment(parallel_config, rank,
distributed_init_method)
@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
...@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
assert torch.allclose(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
...@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@pytest.mark.parametrize("test_target",
[all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
set_start_method("spawn", force=True)
distributed_init_port = get_open_port()
processes = []
for rank in range(tensor_parallel_size):
p = Process(target=test_target,
args=(tensor_parallel_size, rank, distributed_init_port))
p.start()
processes.append(p)
for p in processes:
p.join()
assert all(p.exitcode == 0 for p in processes)
# Using ray helps debugging the error when it failed
# as compared to multiprocessing.
ray.init()
distributed_init_port = get_open_port()
refs = []
for rank in range(tensor_parallel_size):
refs.append(
test_target.remote(tensor_parallel_size, rank,
distributed_init_port))
ray.get(refs)
ray.shutdown()
...@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 12000  # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16, torch.float]
......