Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cd2f63fb
Unverified
Commit
cd2f63fb
authored
Apr 18, 2024
by
Liangfu Chen
Committed by
GitHub
Apr 18, 2024
Browse files
[CI/CD] add neuron docker and ci test scripts (#3571)
parent
87fa80c9
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
103 additions
and
4 deletions
+103
-4
.buildkite/run-neuron-test.sh
.buildkite/run-neuron-test.sh
+37
-0
.buildkite/test-template.j2
.buildkite/test-template.j2
+5
-0
Dockerfile.neuron
Dockerfile.neuron
+36
-0
setup.py
setup.py
+2
-1
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+2
-2
vllm/executor/neuron_executor.py
vllm/executor/neuron_executor.py
+21
-1
No files found.
.buildkite/run-neuron-test.sh
0 → 100644
View file @
cd2f63fb
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
# Abort the script as soon as any command exits with a non-zero status.
set -e

# Try building the docker image
# Log in to the ECR registry first, then build the image tagged "neuron"
# from Dockerfile.neuron using the current directory as build context.
aws ecr get-login-password --region us-west-2 \
    | docker login \
        --username AWS \
        --password-stdin \
        763104351884.dkr.ecr.us-west-2.amazonaws.com
docker build --tag neuron --file Dockerfile.neuron .
# Setup cleanup
# Force-remove the container named "neuron". The `|| true` keeps the function
# from failing (and tripping `set -e`) when the removal command itself fails.
remove_docker_container() {
    docker rm -f neuron || true
}

# Run the cleanup whenever the script exits, and once up front so a container
# left over from an earlier run does not collide with the `--name neuron` below.
trap remove_docker_container EXIT
remove_docker_container
# Run the image
# Start the vLLM API server inside the container with two Neuron devices
# attached and host networking. The trailing `&` puts it in the background
# so the script can go on to poll the server.
docker run \
    --device=/dev/neuron0 \
    --device=/dev/neuron1 \
    --network host \
    --name neuron \
    neuron \
    python3 -m vllm.entrypoints.api_server \
        --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
        --max-num-seqs 8 \
        --max-model-len 128 \
        --block-size 128 \
        --device neuron \
        --tensor-parallel-size 2 &
# Wait for the server to start
# Polls localhost:8000/health once per second until it answers HTTP 200.
# Returns 0 once the server is healthy, or 1 after 300 seconds. Because the
# script runs under `set -e`, the non-zero return aborts the run instead of
# continuing against a server that never came up.
wait_for_server_to_start() {
    # `local` keeps the loop bookkeeping out of the script's global scope.
    local timeout=300
    local counter=0

    # curl prints only the HTTP status code; the response body is discarded.
    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ "$counter" -ge "$timeout" ]; then
            echo "Timeout after $timeout seconds" >&2
            return 1
        fi
    done
}
wait_for_server_to_start
# Test a simple prompt
# --fail makes curl exit non-zero when the server answers with an HTTP error
# status, so a failing /generate endpoint fails this check instead of passing
# silently with exit code 0.
curl --fail -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
.buildkite/test-template.j2
View file @
cd2f63fb
...
@@ -21,6 +21,11 @@ steps:
...
@@ -21,6 +21,11 @@ steps:
queue: amd
queue: amd
command: bash .buildkite/run-amd-test.sh
command: bash .buildkite/run-amd-test.sh
- label: "Neuron Test"
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
- label: "CPU Test"
- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh
command: bash .buildkite/run-cpu-test.sh
...
...
Dockerfile.neuron
0 → 100644
View file @
cd2f63fb
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

FROM $BASE_IMAGE

# An ARG declared before FROM is only visible to FROM itself. Redeclare it
# (without a value) inside the stage so the default reaches the echo below.
ARG BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
# Plain-string form: `[ ${APP_MOUNT} ]` is not a valid JSON array (the path is
# unquoted), so it was parsed as whitespace-separated paths including "[" and "]".
VOLUME ${APP_MOUNT}
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

# Copy only what is needed to install vLLM for the Neuron backend.
COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt

RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt

# Set before the editable install below so the variable is present in that
# build step's environment. key=value is the non-deprecated ENV syntax.
ENV VLLM_BUILD_WITH_NEURON=1
RUN cd /app/vllm \
    && pip install -e . \
    && cd ..

CMD ["/bin/bash"]
setup.py
View file @
cd2f63fb
...
@@ -204,7 +204,8 @@ def _is_neuron() -> bool:
...
@@ -204,7 +204,8 @@ def _is_neuron() -> bool:
subprocess
.
run
([
"neuron-ls"
],
capture_output
=
True
,
check
=
True
)
subprocess
.
run
([
"neuron-ls"
],
capture_output
=
True
,
check
=
True
)
except
(
FileNotFoundError
,
PermissionError
,
subprocess
.
CalledProcessError
):
except
(
FileNotFoundError
,
PermissionError
,
subprocess
.
CalledProcessError
):
torch_neuronx_installed
=
False
torch_neuronx_installed
=
False
return
torch_neuronx_installed
return
torch_neuronx_installed
or
os
.
environ
.
get
(
"VLLM_BUILD_WITH_NEURON"
,
False
)
def
_is_cpu
()
->
bool
:
def
_is_cpu
()
->
bool
:
...
...
vllm/engine/async_llm_engine.py
View file @
cd2f63fb
...
@@ -335,8 +335,8 @@ class AsyncLLMEngine:
...
@@ -335,8 +335,8 @@ class AsyncLLMEngine:
engine_config
=
engine_args
.
create_engine_config
()
engine_config
=
engine_args
.
create_engine_config
()
if
engine_config
.
device_config
.
device_type
==
"neuron"
:
if
engine_config
.
device_config
.
device_type
==
"neuron"
:
raise
NotImplementedError
(
"Neuron is not supported for "
from
vllm.executor.neuron_executor
import
NeuronExecutorAsync
"async engine yet."
)
executor_class
=
NeuronExecutorAsync
elif
engine_config
.
parallel_config
.
worker_use_ray
:
elif
engine_config
.
parallel_config
.
worker_use_ray
:
initialize_ray_cluster
(
engine_config
.
parallel_config
)
initialize_ray_cluster
(
engine_config
.
parallel_config
)
from
vllm.executor.ray_gpu_executor
import
RayGPUExecutorAsync
from
vllm.executor.ray_gpu_executor
import
RayGPUExecutorAsync
...
...
vllm/executor/neuron_executor.py
View file @
cd2f63fb
from
typing
import
Dict
,
List
,
Set
,
Tuple
from
typing
import
Dict
,
List
,
Set
,
Tuple
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
SamplerOutput
,
SequenceGroupMetadata
from
vllm.sequence
import
SamplerOutput
,
SequenceGroupMetadata
from
vllm.utils
import
make_async
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -73,3 +74,22 @@ class NeuronExecutor(ExecutorBase):
...
@@ -73,3 +74,22 @@ class NeuronExecutor(ExecutorBase):
# NeuronExecutor will always be healthy as long as
# NeuronExecutor will always be healthy as long as
# it's running.
# it's running.
return
return
class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
    """Asynchronous counterpart of ``NeuronExecutor``."""

    async def execute_model_async(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> SamplerOutput:
        """Run one model step on the driver worker and return its output.

        The driver worker's ``execute_model`` is wrapped with ``make_async``
        and awaited. Only ``seq_group_metadata_list`` is forwarded; the three
        ``blocks_to_*`` arguments are accepted but not used here.
        """
        # NOTE(review): presumably make_async runs the blocking call off the
        # event loop -- confirm against vllm.utils.make_async.
        run_model = make_async(self.driver_worker.execute_model)
        return await run_model(
            seq_group_metadata_list=seq_group_metadata_list)

    async def check_health_async(self) -> None:
        """Report health; this performs no checks and returns immediately."""
        # NeuronExecutor is treated as healthy for as long as it is running,
        # so there is nothing to verify.
        return None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment