Commit ead94d93 authored by zhuwenwen

Merge remote-tracking branch 'mirror/main'

parents fcffb7c8 f780504d
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file (a rough rendering sketch follows the step list below).
steps:
- label: Regression Test
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
command: pytest -v -s async_engine
- label: Distributed Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only 1 or 2 GPUs are supported for now.
- label: Engine Test
command: pytest -v -s engine
- label: Kernels Test
command: pytest -v -s kernels
soft_fail: true
- label: Models Test
commands:
- pytest -v -s models --forked
soft_fail: true
- label: Samplers Test
command: pytest -v -s samplers --forked
- label: Worker Test
command: pytest -v -s worker
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
- pip install aiohttp
- bash run-benchmarks.sh
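These step definitions are not uploaded to Buildkite directly; they are rendered through the Jinja template `test-template.j2` (shown next) to produce the final pipeline. As a rough sketch of that rendering step, assuming `jinja2` and `PyYAML` are installed (the file paths and this standalone driver are illustrative, not part of the repository):

import yaml
from jinja2 import Template

# Load the step list defined in test-pipeline.yaml ...
with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

# ... and render it through the Jinja template to produce the final pipeline YAML.
with open(".buildkite/test-template.j2") as f:
    pipeline_yaml = Template(f.read()).render(steps=steps)

print(pipeline_yaml)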
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
- wait
{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}
# The vLLM Dockerfile is used to construct a vLLM image that can be used directly
# to run the OpenAI-compatible server.
#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git
WORKDIR /workspace
...@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################
#################### EXTENSION BUILD IMAGE ####################
# image to build pytorch extensions
FROM dev AS build
# install build dependencies
...@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
...@@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test
# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# ignore build dependencies installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
ENTRYPOINT ["python3", "-m", "pytest", "tests"]
#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
...@@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################
FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
...@@ -81,4 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
...@@ -16,6 +16,15 @@ Easy, fast, and cheap LLM serving for everyone
---
**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)**
We are thrilled to announce our second vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from IBM coming up to the stage to discuss their insights on LLM optimizations.
Please register [here](https://lu.ma/ygxbpzhl) and join us!
---
*Latest News* 🔥
- [2023/12] Added ROCm support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
...@@ -98,4 +107,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
year={2023}
}
```
\ No newline at end of file
...@@ -9,11 +9,15 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import os
import sys
from sphinx.ext import autodoc
import logging
sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
logger = logging.getLogger(__name__)
# -- Project information -----------------------------------------------------
...@@ -21,7 +25,6 @@ project = 'vLLM'
copyright = '2023, vLLM Team'
author = 'the vLLM Team'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
...@@ -32,6 +35,8 @@ extensions = [
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
]
# Add any paths that contain templates here, relative to this directory.
...@@ -55,7 +60,6 @@ html_title = project
html_theme = 'sphinx_book_theme'
html_logo = 'assets/logos/vllm-logo-text-light.png'
html_theme_options = {
'logo_only': True,
'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/vllm-project/vllm',
'use_repository_button': True,
...@@ -64,4 +68,29 @@ html_theme_options = {
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Mock out external dependencies here.
autodoc_mock_imports = [
"torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
"vllm.cuda_utils", "vllm._C"
]
for mock_target in autodoc_mock_imports:
if mock_target in sys.modules:
logger.info(
f"Potentially problematic mock target ({mock_target}) found; "
"autodoc_mock_imports cannot mock modules that have already "
"been loaded into sys.modules when the sphinx build starts.")
class MockedClassDocumenter(autodoc.ClassDocumenter):
"""Remove note about base class when a class is derived from object."""
def add_line(self, line: str, source: str, *lineno: int) -> None:
if line == " Bases: :py:class:`object`":
return
super().add_line(line, source, *lineno)
autodoc.ClassDocumenter = MockedClassDocumenter
AsyncLLMEngine
=================================
.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
:members: generate, abort
:show-inheritance:
vLLM Engine
=================================
.. automodule:: vllm.engine
.. currentmodule:: vllm.engine
.. toctree::
:maxdepth: 2
:caption: Engines
llm_engine
async_llm_engine
LLMEngine
=================================
.. autoclass:: vllm.engine.llm_engine.LLMEngine
:members: add_request, abort_request, step, _init_cache
:show-inheritance:
\ No newline at end of file
...@@ -11,6 +11,14 @@ This guide shows how to use vLLM to:
Be sure to complete the :ref:`installation instructions <installation>` before continuing with this guide.
.. note::
By default, vLLM downloads models from `HuggingFace <https://huggingface.co/>`_. If you would like to use models from `ModelScope <https://www.modelscope.cn>`_ in the following examples, please set the environment variable:
.. code-block:: shell
export VLLM_USE_MODELSCOPE=True
Offline Batched Inference
-------------------------
...@@ -40,16 +48,6 @@ Initialize vLLM's engine for offline inference with the ``LLM`` class and the `O
llm = LLM(model="facebook/opt-125m")
Use model from www.modelscope.cn
.. code-block:: shell
export VLLM_USE_MODELSCOPE=True
.. code-block:: python
llm = LLM(model="qwen/Qwen-7B-Chat", revision="v1.1.8", trust_remote_code=True)
Call ``llm.generate`` to generate the outputs. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
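As a minimal sketch (assuming the ``prompts`` list and ``sampling_params`` defined earlier in this guide):

.. code-block:: python

    outputs = llm.generate(prompts, sampling_params)

    # Print the prompt and the generated text for each request.
    for output in outputs:
        print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")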
.. code-block:: python
...@@ -77,16 +75,6 @@ Start the server:
$ python -m vllm.entrypoints.api_server
Use model from www.modelscope.cn
.. code-block:: console
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server \
$ --model="qwen/Qwen-7B-Chat" \
$ --revision="v1.1.8" \
$ --trust-remote-code
By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model.
Query the model in shell:
...@@ -107,7 +95,7 @@ OpenAI-Compatible Server
------------------------
vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
Start the server:
...@@ -116,13 +104,6 @@ Start the server:
$ python -m vllm.entrypoints.openai.api_server \
$ --model facebook/opt-125m
Use model from www.modelscope.cn
.. code-block:: console
$ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
$ --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code
By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:
.. code-block:: console
......
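A rough sketch of querying the OpenAI-compatible server from Python, assuming the server started above is listening on ``http://localhost:8000`` and the ``openai`` (v1) client library is installed; the placeholder API key follows the same convention as the Gradio example added in this commit:

.. code-block:: python

    from openai import OpenAI

    # The API key is unused by the local vLLM server; "EMPTY" is a common placeholder.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    completion = client.completions.create(model="facebook/opt-125m",
                                           prompt="San Francisco is a")
    print(completion.choices[0].text)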
...@@ -85,4 +85,16 @@ Documentation
:maxdepth: 1
:caption: Quantization
quantization/auto_awq
\ No newline at end of file
.. toctree::
:maxdepth: 2
:caption: Developer Documentation
dev/engine/engine_index
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
import argparse
from openai import OpenAI
import gradio as gr
# Argument parser setup
parser = argparse.ArgumentParser(
description='Chatbot Interface with Customizable Parameters')
parser.add_argument('--model-url',
type=str,
default='http://localhost:8000/v1',
help='Model URL')
parser.add_argument('-m',
'--model',
type=str,
required=True,
help='Model name for the chatbot')
parser.add_argument('--temp',
type=float,
default=0.8,
help='Temperature for text generation')
parser.add_argument('--stop-token-ids',
type=str,
default='',
help='Comma-separated stop token IDs')
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8001)
# Parse the arguments
args = parser.parse_args()
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = args.model_url
# Create an OpenAI client to interact with the API server
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
def predict(message, history):
# Convert chat history to OpenAI format
history_openai_format = [{
"role": "system",
"content": "You are a great ai assistant."
}]
for human, assistant in history:
history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({
"role": "assistant",
"content": assistant
})
history_openai_format.append({"role": "user", "content": message})
# Create a chat completion request and send it to the API server
stream = client.chat.completions.create(
model=args.model, # Model name to use
messages=history_openai_format, # Chat history
temperature=args.temp, # Temperature for text generation
stream=True, # Stream response
extra_body={
'repetition_penalty':
1,
'stop_token_ids': [
int(id.strip()) for id in args.stop_token_ids.split(',')
if id.strip()
] if args.stop_token_ids else []
})
# Read and return generated text from response stream
partial_message = ""
for chunk in stream:
partial_message += (chunk.choices[0].delta.content or "")
yield partial_message
# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(server_name=args.host,
server_port=args.port,
share=True)
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{% for message in messages %}
{% if message['role'] == 'user' %}
<reserved_106>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
<reserved_107>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
<reserved_107>
{% endif %}
\ No newline at end of file
...@@ -13,4 +13,6 @@ types-setuptools
pytest
pytest-forked
pytest-asyncio
httpx
einops # required for MPT
flash_attn # required for HuggingFace's llama implementation
...@@ -352,6 +352,11 @@ def get_requirements() -> List[str]:
return requirements
package_data = {"vllm": ["py.typed"]}
if os.environ.get("VLLM_USE_PRECOMPILED"):
ext_modules = []
package_data["vllm"].append("*.so")
setuptools.setup(
name="vllm",
version=get_vllm_version(),
...@@ -380,5 +385,5 @@ setuptools.setup(
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
package_data=package_data,
)
...@@ -29,8 +29,13 @@ def api_server():
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
uvicorn_process = subprocess.Popen([
sys.executable,
"-u",
str(script_path),
"--model",
"facebook/opt-125m",
"--host",
"127.0.0.1",
])
yield
uvicorn_process.terminate()
...@@ -81,6 +86,9 @@ def test_api_server(api_server):
pool.join()
# check cancellation stats
# give it some time to update the stats
time.sleep(1)
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
......
from argparse import Namespace
from dataclasses import dataclass
import os
import pathlib
import pytest
from fastapi.testclient import TestClient
from vllm.entrypoints.openai.api_server import *
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...@@ -21,8 +26,7 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...@@ -54,8 +58,7 @@ class MockTokenizer:
def test_load_chat_template():
# Testing chatml template
mock_args = Namespace(chat_template=chatml_jinja_path)
tokenizer = MockTokenizer()
# Call the function with the mocked args
......
...@@ -2,10 +2,9 @@
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
from multiprocessing import Process, set_start_method
import pytest
import torch
import ray
from vllm.config import ParallelConfig
from vllm.utils import get_open_port
...@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
tensor_parallel_size,
worker_use_ray=True)
distributed_init_method = f"tcp://localhost:{distributed_init_port}"
torch.cuda.set_device(rank)
_init_distributed_environment(parallel_config, rank,
distributed_init_method)
@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
...@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
assert torch.allclose(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
...@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@pytest.mark.parametrize("test_target",
[all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
set_start_method("spawn", force=True)
distributed_init_port = get_open_port()
processes = []
for rank in range(tensor_parallel_size):
p = Process(target=test_target,
args=(tensor_parallel_size, rank, distributed_init_port))
p.start()
processes.append(p)
for p in processes:
p.join()
assert all(p.exitcode == 0 for p in processes)
# Using ray helps debugging the error when it failed
# as compared to multiprocessing.
ray.init()
distributed_init_port = get_open_port()
refs = []
for rank in range(tensor_parallel_size):
refs.append(
test_target.remote(tensor_parallel_size, rank,
distributed_init_port))
ray.get(refs)
ray.shutdown()
...@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 12000  # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16, torch.float]
......