Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
898cdf03
Unverified
Commit
898cdf03
authored
Jan 06, 2025
by
Liangfu Chen
Committed by
GitHub
Jan 06, 2025
Browse files
[CI] Fix neuron CI and run offline tests (#11779)
Signed-off-by:
Liangfu Chen
<
liangfc@amazon.com
>
parent
0f3f3c86
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
35 additions
and
37 deletions
+35
-37
.buildkite/run-neuron-test.sh
.buildkite/run-neuron-test.sh
+27
-26
Dockerfile.neuron
Dockerfile.neuron
+6
-2
examples/offline_inference_neuron.py
examples/offline_inference_neuron.py
+2
-9
No files found.
.buildkite/run-neuron-test.sh
View file @
898cdf03
...
@@ -3,6 +3,18 @@
...
@@ -3,6 +3,18 @@
# This script build the Neuron docker image and run the API server inside the container.
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
# It serves a sanity check for compilation and basic model usage.
set
-e
set
-e
set
-v
image_name
=
"neuron/vllm-ci"
container_name
=
"neuron_
$(
tr
-dc
A-Za-z0-9 < /dev/urandom |
head
-c
10
;
echo
)
"
HF_CACHE
=
"
$(
realpath
~
)
/huggingface"
mkdir
-p
"
${
HF_CACHE
}
"
HF_MOUNT
=
"/root/.cache/huggingface"
NEURON_COMPILE_CACHE_URL
=
"
$(
realpath
~
)
/neuron_compile_cache"
mkdir
-p
"
${
NEURON_COMPILE_CACHE_URL
}
"
NEURON_COMPILE_CACHE_MOUNT
=
"/root/.cache/neuron_compile_cache"
# Try building the docker image
# Try building the docker image
aws ecr get-login-password
--region
us-west-2 | docker login
--username
AWS
--password-stdin
763104351884.dkr.ecr.us-west-2.amazonaws.com
aws ecr get-login-password
--region
us-west-2 | docker login
--username
AWS
--password-stdin
763104351884.dkr.ecr.us-west-2.amazonaws.com
...
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
...
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build
=
$(
cat
/tmp/neuron-docker-build-timestamp
)
last_build
=
$(
cat
/tmp/neuron-docker-build-timestamp
)
current_time
=
$(
date
+%s
)
current_time
=
$(
date
+%s
)
if
[
$((
current_time
-
last_build
))
-gt
86400
]
;
then
if
[
$((
current_time
-
last_build
))
-gt
86400
]
;
then
docker image prune
-f
docker system prune
-f
docker system prune
-f
rm
-rf
"
${
HF_MOUNT
:?
}
/*"
rm
-rf
"
${
NEURON_COMPILE_CACHE_MOUNT
:?
}
/*"
echo
"
$current_time
"
>
/tmp/neuron-docker-build-timestamp
echo
"
$current_time
"
>
/tmp/neuron-docker-build-timestamp
fi
fi
else
else
date
"+%s"
>
/tmp/neuron-docker-build-timestamp
date
"+%s"
>
/tmp/neuron-docker-build-timestamp
fi
fi
docker build
-t
neuron
-f
Dockerfile.neuron
.
docker build
-t
"
${
image_name
}
"
-f
Dockerfile.neuron
.
# Setup cleanup
# Setup cleanup
remove_docker_container
()
{
docker
rm
-f
neuron
||
true
;
}
remove_docker_container
()
{
docker image
rm
-f
"
${
image_name
}
"
||
true
;
}
trap
remove_docker_container EXIT
trap
remove_docker_container EXIT
remove_docker_container
# Run the image
# Run the image
docker run
--device
=
/dev/neuron0
--device
=
/dev/neuron1
--network
host
--name
neuron neuron python3
-m
vllm.entrypoints.api_server
\
docker run
--rm
-it
--device
=
/dev/neuron0
--device
=
/dev/neuron1
--network
host
\
--model
TinyLlama/TinyLlama-1.1B-Chat-v1.0
--max-num-seqs
8
--max-model-len
128
--block-size
128
--device
neuron
--tensor-parallel-size
2 &
-v
"
${
HF_CACHE
}
:
${
HF_MOUNT
}
"
\
-e
"HF_HOME=
${
HF_MOUNT
}
"
\
# Wait for the server to start
-v
"
${
NEURON_COMPILE_CACHE_URL
}
:
${
NEURON_COMPILE_CACHE_MOUNT
}
"
\
wait_for_server_to_start
()
{
-e
"NEURON_COMPILE_CACHE_URL=
${
NEURON_COMPILE_CACHE_MOUNT
}
"
\
timeout
=
300
--name
"
${
container_name
}
"
\
counter
=
0
${
image_name
}
\
/bin/bash
-c
"python3 /workspace/vllm/examples/offline_inference_neuron.py"
while
[
"
$(
curl
-s
-o
/dev/null
-w
'%{http_code}'
localhost:8000/health
)
"
!=
"200"
]
;
do
sleep
1
counter
=
$((
counter
+
1
))
if
[
$counter
-ge
$timeout
]
;
then
echo
"Timeout after
$timeout
seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl
-X
POST
-H
"Content-Type: application/json"
\
localhost:8000/generate
\
-d
'{"prompt": "San Francisco is a"}'
Dockerfile.neuron
View file @
898cdf03
...
@@ -15,8 +15,8 @@ RUN apt-get update && \
...
@@ -15,8 +15,8 @@ RUN apt-get update && \
ffmpeg libsm6 libxext6 libgl1
ffmpeg libsm6 libxext6 libgl1
### Mount Point ###
### Mount Point ###
# When launching the container, mount the code directory to /
app
# When launching the container, mount the code directory to /
workspace
ARG APP_MOUNT=/
app
ARG APP_MOUNT=/
workspace
VOLUME [ ${APP_MOUNT} ]
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm
WORKDIR ${APP_MOUNT}/vllm
...
@@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
...
@@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
COPY . .
COPY . .
ARG GIT_REPO_CHECK=0
ARG GIT_REPO_CHECK=0
...
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
...
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
# install development dependencies (for testing)
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
RUN python3 -m pip install -e tests/vllm_test_utils
# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
CMD ["/bin/bash"]
CMD ["/bin/bash"]
examples/offline_inference_neuron.py
View file @
898cdf03
import
os
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
# creates XLA hlo graphs for all the context length buckets.
os
.
environ
[
'NEURON_CONTEXT_LENGTH_BUCKETS'
]
=
"128,512,1024,2048"
# creates XLA hlo graphs for all the token gen buckets.
os
.
environ
[
'NEURON_TOKEN_GEN_BUCKETS'
]
=
"128,512,1024,2048"
# Sample prompts.
# Sample prompts.
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -26,8 +19,8 @@ llm = LLM(
...
@@ -26,8 +19,8 @@ llm = LLM(
# Currently, this is a known limitation in continuous batching support
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len
=
2048
,
max_model_len
=
1024
,
block_size
=
2048
,
block_size
=
1024
,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
# or explicitly assigned.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment