[CI/CD] add neuron docker and ci test scripts (#3571)

cd2f63fb · Liangfu Chen · GitHub · 87fa80c9 · cd2f63fb · cd2f63fb
Unverified Commit cd2f63fb authored Apr 18, 2024 by Liangfu Chen Committed by GitHub Apr 18, 2024
6 changed files
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
+# This script builds the Neuron docker image and runs the API server inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -e
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+docker build -t neuron -f Dockerfile.neuron .
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+# Run the image
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+# Wait for the server to start
+wait_for_server_to_start() {
+    # Poll the health endpoint once per second until it answers HTTP 200.
+    # Returns 1 after $timeout attempts so that `set -e` aborts the script
+    # instead of falling through to the prompt test against a dead server.
+    local timeout=300
+    local counter=0
+    until [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" = "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ "$counter" -ge "$timeout" ]; then
+            echo "Timeout after $timeout seconds" >&2
+            return 1
+        fi
+    done
+}
+wait_for_server_to_start
+# Test a simple prompt. --fail makes curl exit non-zero on an HTTP error
+# response (4xx/5xx), so a broken /generate endpoint fails the CI step.
+curl --fail -X POST -H "Content-Type: application/json" \
+    localhost:8000/generate \
+    -d '{"prompt": "San Francisco is a"}'
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -21,6 +21,11 @@ steps:
      queue: amd
    command: bash .buildkite/run-amd-test.sh
+  - label: "Neuron Test"
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
  - label: "CPU Test"
    command: bash .buildkite/run-cpu-test.sh

--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+FROM $BASE_IMAGE
+# An ARG declared before FROM is out of scope inside the build stage unless it
+# is re-declared; without this line the echo below prints an empty value.
+ARG BASE_IMAGE
+RUN echo "Base image is $BASE_IMAGE"
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+# Plain-string form: "[ ${APP_MOUNT} ]" is not valid JSON and would be parsed
+# as three separate volume paths ("[", the mount point, and "]").
+VOLUME ${APP_MOUNT}
+WORKDIR ${APP_MOUNT}
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+RUN cd /app/vllm \
+    && python3 -m pip install -U -r requirements-neuron.txt
+# Read by setup.py's _is_neuron() so the Neuron build is selected even when
+# the neuron-ls probe fails (e.g. no Neuron device visible during docker build).
+ENV VLLM_BUILD_WITH_NEURON=1
+RUN cd /app/vllm \
+    && pip install -e .
+CMD ["/bin/bash"]
--- a/setup.py
+++ b/setup.py
@@ -204,7 +204,8 @@ def _is_neuron() -> bool:
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        torch_neuronx_installed = False
-    return torch_neuronx_installed
+    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
+                                                     False)
 def _is_cpu() -> bool:

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -335,8 +335,8 @@ class AsyncLLMEngine:
        engine_config = engine_args.create_engine_config()
        if engine_config.device_config.device_type == "neuron":
-            raise NotImplementedError("Neuron is not supported for "
+            from vllm.executor.neuron_executor import NeuronExecutorAsync
-                                      "async engine yet.")
+            executor_class = NeuronExecutorAsync
        elif engine_config.parallel_config.worker_use_ray:
            initialize_ray_cluster(engine_config.parallel_config)
            from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync

--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
 from typing import Dict, List, Set, Tuple
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import make_async
 logger = init_logger(__name__)
@@ -73,3 +74,22 @@ class NeuronExecutor(ExecutorBase):
        # NeuronExecutor will always be healthy as long as
        # it's running.
        return
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+    """Asynchronous counterpart of NeuronExecutor.
+
+    Reuses the synchronous executor's driver worker and exposes the
+    coroutine entry points declared by ExecutorAsyncBase.
+    """
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        """Run one model step on the driver worker and await its result.
+
+        Only ``seq_group_metadata_list`` is forwarded to the worker; the
+        swap-in, swap-out and copy mappings are accepted for interface
+        compatibility but are not passed on.
+        """
+        # make_async wraps the worker's synchronous execute_model so it can be
+        # awaited (presumably off the event loop -- confirm in vllm.utils).
+        run_step = make_async(self.driver_worker.execute_model)
+        return await run_step(
+            seq_group_metadata_list=seq_group_metadata_list, )
+    async def check_health_async(self) -> None:
+        """Report health; a running NeuronExecutor is always considered healthy."""
+        return None