OpenDAS / text-generation-inference · Commits

Commit 53ee09c0 (unverified), authored Apr 14, 2023 by OlivierDehaene, committed by GitHub on Apr 14, 2023

fea(dockerfile): better layer caching (#159)
Parent: 12e5633c

Showing 7 changed files with 140 additions and 48 deletions.
.github/workflows/build.yaml          +4   -6
.github/workflows/client-tests.yaml   +1   -1
.github/workflows/tests.yaml          +1   -1
Dockerfile                            +113 -28
clients/python/tests/test_client.py   +2   -2
server/Makefile-flash-att             +11  -5
server/Makefile-transformers          +8   -5
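The layer-caching idea behind this commit is the standard multi-stage pattern: slow, rarely-changing steps are isolated in builder stages that COPY in only the files they depend on, and the runtime image pulls in their artifacts with COPY --from. A minimal sketch of that pattern (build-recipe.sh and app/ are hypothetical stand-ins, not files from this repo):

# Minimal sketch of the caching pattern, not this repo's Dockerfile.
# build-recipe.sh and app/ are hypothetical stand-ins.
FROM ubuntu:22.04 AS builder
WORKDIR /usr/src
# This layer (and everything below it) is invalidated only when
# build-recipe.sh itself changes.
COPY build-recipe.sh .
RUN sh build-recipe.sh > artifact.txt

FROM ubuntu:22.04 AS runtime
# Ship only the builder's output; no toolchain in the final image.
COPY --from=builder /usr/src/artifact.txt /opt/artifact.txt
# Frequently edited code is copied last, so the layers above stay cached.
COPY app /opt/app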
.github/workflows/build.yaml

@@ -26,7 +26,7 @@ concurrency:
 jobs:
   build-and-push-image:
-    runs-on: large
+    runs-on: ubuntu-latest
     permissions:
       contents: write
       packages: write
@@ -45,9 +45,7 @@ jobs:
         uses: rlespinasse/github-slug-action@v4.4.1
       - name: Install cosign
         if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@f3c664df7af409cb4873aa5068053ba9d61a57b6 #v2.6.0
-        with:
-          cosign-release: 'v1.13.1'
+        uses: sigstore/cosign-installer@v3.0.2
       - name: Tailscale
         uses: tailscale/github-action@v1
         with:
@@ -66,7 +64,7 @@ jobs:
           password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
           registry: registry.internal.huggingface.tech
       - name: Login to Azure Container Registry
-        if: github.event_name != 'pull_request'
+        # if: github.event_name != 'pull_request'
         uses: docker/login-action@v2.1.0
         with:
           username: ${{ secrets.AZURE_DOCKER_USERNAME }}
@@ -136,7 +134,7 @@ jobs:
   build-and-push-sagemaker-image:
     needs:
       - build-and-push-image
-    runs-on: large
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
.github/workflows/client-tests.yaml

@@ -8,7 +8,7 @@ on:
 jobs:
   run_tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
.github/workflows/tests.yaml

@@ -17,7 +17,7 @@ concurrency:
 jobs:
   run_tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     env:
       SCCACHE_GHA_ENABLED: "on"
Dockerfile

 # Rust builder
 FROM lukemathwalker/cargo-chef:latest-rust-1.67 AS chef
 WORKDIR /usr/src

@@ -27,51 +28,135 @@ COPY router router
 COPY launcher launcher
 RUN cargo build --release

-FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as base
+# Python builder
+# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
+FROM ubuntu:22.04 as pytorch-install
+
+ARG PYTORCH_VERSION=2.0.0
+ARG PYTHON_VERSION=3.9
+ARG CUDA_VERSION=11.8
+ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_CHANNEL=nvidia
+ARG INSTALL_CHANNEL=pytorch
+# Automatically set by buildx
+ARG TARGETPLATFORM
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        ccache \
+        cmake \
+        curl \
+        git && \
+        rm -rf /var/lib/apt/lists/*
+
+RUN /usr/sbin/update-ccache-symlinks && \
+    mkdir /opt/ccache && \
+    ccache --set-config=cache_dir=/opt/ccache
+
+ENV PATH /opt/conda/bin:$PATH
+
+# Install conda
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+        "linux/arm64") MAMBA_ARCH=aarch64 ;; \
+        *)             MAMBA_ARCH=x86_64  ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# Install pytorch
+# On arm64 we exit with an error code
+RUN case ${TARGETPLATFORM} in \
+        "linux/arm64") exit 1 ;; \
+        *)             /opt/conda/bin/conda update -y conda && \
+                       /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# CUDA kernels builder image
+FROM pytorch-install as kernel-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ninja-build \
+        && rm -rf /var/lib/apt/lists/*
+
+RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
+    /opt/conda/bin/conda clean -ya
+
+# Build Flash Attention CUDA kernels
+FROM kernel-builder as flash-att-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att Makefile
+
+# Build specific version of flash attention
+RUN make build-flash-attention
+
+# Build Transformers CUDA kernels
+FROM kernel-builder as transformers-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-transformers Makefile
+
+# Build specific version of transformers
+RUN BUILD_EXTENSIONS="True" make build-transformers
+
+# Text Generation Inference base image
+FROM ubuntu:22.04 as base
+
+ARG TARGETPLATFORM
+ARG PYTORCH_VERSION=2.0.0
+ARG CUDA_VERSION=11.8

-ENV LANG=C.UTF-8 \
-    LC_ALL=C.UTF-8 \
-    DEBIAN_FRONTEND=noninteractive \
-    HUGGINGFACE_HUB_CACHE=/data \
+# Conda and CUDA env
+ENV PATH=/opt/conda/bin:$PATH \
+    NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
     MODEL_ID=bigscience/bloom-560m \
     QUANTIZE=false \
     NUM_SHARD=1 \
-    PORT=80 \
-    CUDA_HOME=/usr/local/cuda \
-    LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
-    CONDA_DEFAULT_ENV=text-generation \
-    PATH=$PATH:/opt/miniconda/envs/text-generation/bin:/opt/miniconda/bin:/usr/local/cuda/bin
-
-RUN apt-get update && apt-get install -y git curl libssl-dev && rm -rf /var/lib/apt/lists/*
-
-LABEL com.nvidia.volumes.needed="nvidia_driver"
-
-RUN cd ~ && \
-    curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    chmod +x Miniconda3-latest-Linux-x86_64.sh && \
-    bash ./Miniconda3-latest-Linux-x86_64.sh -bf -p /opt/miniconda && \
-    conda create -n text-generation python=3.9 -y
+    PORT=80

 WORKDIR /usr/src

-# Install torch
-RUN pip install torch --extra-index-url https://download.pytorch.org/whl/cu118 --no-cache-dir
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        libssl-dev \
+        make \
+        && rm -rf /var/lib/apt/lists/*

-# Install specific version of flash attention
-COPY server/Makefile-flash-att server/Makefile
-RUN cd server && make install-flash-attention
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda

-# Install specific version of transformers
-COPY server/Makefile-transformers server/Makefile
-RUN cd server && BUILD_EXTENSIONS="True" make install-transformers
+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

-COPY server/Makefile server/Makefile
+# Copy build artifacts from transformers builder
+COPY --from=transformers-builder /usr/src/transformers /usr/src/transformers
+COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cpython-39/transformers /usr/src/transformers/src/transformers
+
+# Install transformers dependencies
+RUN cd /usr/src/transformers && pip install -e . --no-cache-dir

 # Install server
 COPY proto proto
 COPY server server
+COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    /opt/miniconda/envs/text-generation/bin/pip install ".[bnb]" --no-cache-dir
+    pip install ".[bnb]" --no-cache-dir

 # Install router
 COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router

@@ -86,7 +171,7 @@ RUN chmod +x entrypoint.sh
 ENTRYPOINT ["./entrypoint.sh"]

-# Original image
+# Final image
 FROM base

 ENTRYPOINT ["text-generation-launcher"]
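Taken together, the new Dockerfile is a small build graph rather than one long stage. A condensed reading aid follows (stage names and base images are from the diff above; all other instructions are elided, so this is not the real file):

# Reading aid only: stage names are from the diff; bodies elided.
FROM lukemathwalker/cargo-chef:latest-rust-1.67 AS chef    # builds router + launcher
FROM ubuntu:22.04 AS pytorch-install                       # conda env with pinned PyTorch
FROM pytorch-install AS kernel-builder                     # adds CUDA 11.8 toolkit + ninja
FROM kernel-builder AS flash-att-builder                   # COPYs only Makefile-flash-att, runs make build-flash-attention
FROM kernel-builder AS transformers-builder                # COPYs only Makefile-transformers, runs make build-transformers
FROM ubuntu:22.04 AS base                                  # runtime: COPY --from= the conda env and built kernels
# Server code is copied last, so the kernel-build layers survive
# day-to-day code edits; a commit bump rebuilds only its own branch.

The practical effect: editing router, launcher, or server code no longer triggers a kernel rebuild, while bumping flash_att_commit or transformers_commit rebuilds only the corresponding branch of the graph.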
clients/python/tests/test_client.py

@@ -17,7 +17,7 @@ def test_generate(flan_t5_xxl_url, hf_headers):
     assert response.details.prefill[0] == PrefillToken(id=0, text="<pad>", logprob=None)
     assert len(response.details.tokens) == 1
     assert response.details.tokens[0] == Token(
-        id=3, text=" ", logprob=-1.984375, special=False
+        id=3, text="", logprob=-1.984375, special=False
     )
@@ -83,7 +83,7 @@ async def test_generate_async(flan_t5_xxl_url, hf_headers):
     assert response.details.prefill[0] == PrefillToken(id=0, text="<pad>", logprob=None)
     assert len(response.details.tokens) == 1
     assert response.details.tokens[0] == Token(
-        id=3, text=" ", logprob=-1.984375, special=False
+        id=3, text="", logprob=-1.984375, special=False
     )
server/Makefile-flash-att

 flash_att_commit := d478eeec8f16c7939c54e4617dbd36f59b8eeed7

-install-flash-attention:
-	# Install specific version of flash attention
+flash-attention:
+	# Clone flash attention
 	pip install packaging
-	pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
-	rm -rf flash-attention || true
 	git clone https://github.com/HazyResearch/flash-attention.git
-	cd flash-attention && git checkout $(flash_att_commit)
+
+build-flash-attention: flash-attention
+	cd flash-attention && git fetch && git checkout $(flash_att_commit)
+	cd flash-attention && python setup.py build
+	cd flash-attention/csrc/rotary && python setup.py build
+	cd flash-attention/csrc/layer_norm && python setup.py build
+
+install-flash-attention: build-flash-attention
+	pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
 	cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
\ No newline at end of file
server/Makefile-transformers

 transformers_commit := b8d969ff47c6a9d40538a6ea33df021953363afc

-install-transformers:
-	# Install specific version of transformers with custom cuda kernels
+transformers:
+	# Clone fork of transformers with custom CUDA kernels and sharding logic
 	pip install --upgrade setuptools
-	pip uninstall transformers -y || true
-	rm -rf transformers || true
 	git clone https://github.com/OlivierDehaene/transformers.git
-	cd transformers && git checkout $(transformers_commit)
+
+build-transformers: transformers
+	cd transformers && git fetch && git checkout $(transformers_commit) && python setup.py build
+
+install-transformers: build-transformers
+	pip uninstall transformers -y || true
 	cd transformers && python setup.py install
\ No newline at end of file
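Both Makefiles now share the same three-target shape: a clone target, a build-* target that compiles against the pinned commit, and an install-* target kept for local development. Inside the image build, only the build-* targets run; the runtime stage copies the compiled output instead of installing it. Pairing the relevant lines from the Dockerfile diff above (an excerpt, not a standalone Dockerfile):

# In the flash-att-builder stage: compile only, keyed to one Makefile.
COPY server/Makefile-flash-att Makefile
RUN make build-flash-attention

# In the final base stage: copy the built extensions into site-packages,
# so no compiler or CUDA toolkit ships in the runtime image.
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages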