Unverified Commit 8390e251 authored by Nicolas Patry, committed by GitHub

Making `make install` work better by default. (#2004)

# What does this PR do?

Makes `make install` a much saner default for starting local dev environments.

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
parent d14eaaca
@@ -68,7 +68,7 @@ jobs:
             ~/.cargo/git
       - name: Install
         run: |
-          make install
+          make install-cpu
       - name: Run server tests
         run: |
           pip install pytest
...
@@ -20,6 +20,10 @@ tokenizers = { version = "0.19.1", features = ["http"] }
 hf-hub = { version = "0.3.1", features = ["tokio"] }
 
 [profile.release]
+incremental = true
+
+[profile.release-binary]
+inherits = "release"
 debug = 1
 incremental = true
 lto = "fat"
...
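For context, the Cargo hunk above leaves two profiles in play: the default `release` profile becomes fast and incremental, while the expensive settings move into a new `release-binary` profile that inherits from it. A minimal sketch of the resolved settings, assuming the keys visible in the hunk are the only ones set:

```toml
# Fast default `cargo build --release`: incremental, no fat LTO.
[profile.release]
incremental = true

# Opt-in heavy profile for shipped binaries: inherits `release`,
# then re-enables the expensive knobs.
[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
lto = "fat"
```

The heavy profile is selected explicitly with `cargo build --profile release-binary`; plain `--release` builds stay quick for local iteration.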
@@ -193,7 +193,7 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib
 COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from custom kernels builder
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
...
 install-server:
 	cd server && make install
 
-install-custom-kernels:
-	if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi
-
-install-integration-tests:
-	cd integration-tests && pip install -r requirements.txt
-	cd clients/python && pip install .
+install-server-cpu:
+	cd server && make install-server
 
 install-router:
 	cd router && cargo install --path .
@@ -17,7 +13,10 @@ install-launcher:
 install-benchmark:
 	cd benchmark && cargo install --path .
 
-install: install-server install-router install-launcher install-custom-kernels
+install: install-server install-router install-launcher
+
+install-cpu: install-server-cpu install-router install-launcher
 
 server-dev:
 	cd server && make run-dev
@@ -28,6 +27,10 @@ router-dev:
 rust-tests: install-router install-launcher
 	cargo test
 
+install-integration-tests:
+	cd integration-tests && pip install -r requirements.txt
+	cd clients/python && pip install .
+
 integration-tests: install-integration-tests
 	pytest -s -vv -m "not private" integration-tests
...
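The new top-level wiring can be sketched in plain shell, with echo stand-ins for the real recipes (the function names are hypothetical mirrors of the make targets, not part of the repo):

```shell
# Stand-ins for the make targets above; each echoes instead of installing.
install_server_cpu() { echo "server: pip deps only, no CUDA kernels"; }
install_router()     { echo "router: cargo install"; }
install_launcher()   { echo "launcher: cargo install"; }

# `make install-cpu` fans out to its three prerequisites in order.
install_cpu() {
    install_server_cpu
    install_router
    install_launcher
}

install_cpu
```

The point of the split is that `install-cpu` reaches a working dev setup on machines without CUDA, while plain `install` keeps the full GPU path.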
@@ -13,7 +13,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         .out_dir("src/v2/pb")
         .include_file("mod.rs")
         .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
-        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
+        .map_err(|e| match e.kind(){
+            std::io::ErrorKind::NotFound => {panic!("`protoc` not found, install libprotoc")},
+            std::io::ErrorKind::Other => {panic!("`protoc` version unsupported, upgrade protoc: https://github.com/protocolbuffers/protobuf/releases")},
+            e => {e}
+        }).unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
 
     fs::create_dir_all("src/v3/pb").unwrap_or(());
     let mut config = prost_build::Config::new();
...
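The build.rs change distinguishes two `protoc` failures: the binary is missing entirely (`ErrorKind::NotFound`) versus present but too old. The missing-binary case is the same check a shell script would make before compiling protos; `definitely-missing-protoc` below is a hypothetical command name chosen so the lookup always fails:

```shell
# Mirror of the NotFound branch: probe for the binary before compiling,
# and surface an actionable message instead of a generic I/O error.
if ! command -v definitely-missing-protoc >/dev/null 2>&1; then
    echo 'protoc not found, install libprotoc'
fi
```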
@@ -10,18 +10,26 @@ unit-tests:
 gen-server:
 	# Compile protos
-	pip install grpcio-tools==1.51.1 mypy-protobuf==3.4.0 'types-protobuf>=3.20.4' --no-cache-dir
+	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
 	mkdir text_generation_server/pb || true
 	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
 		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
 	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
 	touch text_generation_server/pb/__init__.py
 
-install: gen-server
+install-server: gen-server
 	pip install pip --upgrade
 	pip install -r requirements_cuda.txt
 	pip install -e ".[bnb, accelerate, quantize, peft, outlines]"
 
+install: install-cuda
+	echo "Installed server"
+
+install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
+
+install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
+
 run-dev:
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
...
 flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec
 
-flash-attention:
-	# Clone flash attention
-	pip install -U packaging ninja --no-cache-dir
-	git clone https://github.com/HazyResearch/flash-attention.git
-
-build-flash-attention: flash-attention
-	cd flash-attention && git fetch && git checkout $(flash_att_commit)
-	cd flash-attention && python setup.py build
-	cd flash-attention/csrc/rotary && python setup.py build
-	cd flash-attention/csrc/layer_norm && python setup.py build
+build-flash-attention:
+	if [ ! -d 'flash-attention' ]; then \
+		pip install -U packaging ninja --no-cache-dir && \
+		git clone https://github.com/HazyResearch/flash-attention.git && \
+		cd flash-attention && git fetch && git checkout $(flash_att_commit) && \
+		MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build; \
+	fi
 
 install-flash-attention: build-flash-attention
-	pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
-	cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
+	if [ ! -d 'flash-attention' ]; then \
+		cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install; \
+	fi
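The rewritten targets share one idiom: clone and build only when the checkout directory is absent, so re-running `make` is a cheap no-op instead of a re-clone. A minimal sketch of that guard, where `fetch_and_build` and the `touch` stand-in are hypothetical (the real recipes run `git clone` and `python setup.py build` at that point):

```shell
# Idempotent guard: expensive work happens only on the first invocation.
fetch_and_build() {
    dir=$1
    if [ ! -d "$dir" ]; then
        mkdir -p "$dir" && touch "$dir/.built"   # stand-in for clone + build
        echo "built $dir"
    else
        echo "skipped $dir"                      # already present: no-op
    fi
}

fetch_and_build demo-checkout   # first run performs the "build"
fetch_and_build demo-checkout   # second run is skipped
```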
-flash_att_v2_commit_cuda := v2.5.8
+flash_att_v2_commit_cuda := v2.5.9.post1
 flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6
 
-flash-attention-v2-cuda:
-	# Clone flash attention
-	pip install -U packaging ninja --no-cache-dir
-	git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2
-
-build-flash-attention-v2-cuda: flash-attention-v2-cuda
-	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
-	cd flash-attention-v2 && git submodule update --init --recursive
-	cd flash-attention-v2 && python setup.py build
-
-install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
-	cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
-
-flash-attention-v2-rocm:
-	# Clone flash attention
-	pip install -U packaging ninja --no-cache-dir
-	git clone https://github.com/ROCm/flash-attention.git flash-attention-v2
-
-build-flash-attention-v2-rocm: flash-attention-v2-rocm
-	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm)
-	cd flash-attention-v2 && git submodule update --init --recursive
-	cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
+build-flash-attention-v2-cuda:
+	pip install -U packaging wheel
+	pip install flash-attn==$(flash_att_v2_commit_cuda)
+
+install-flash-attention-v2-cuda:
+	pip install -U packaging wheel
+	pip install flash-attn==$(flash_att_v2_commit_cuda)
+
+build-flash-attention-v2-rocm:
+	if [ ! -d 'flash-attention-v2' ]; then \
+		pip install -U packaging ninja --no-cache-dir && \
+		git clone https://github.com/ROCm/flash-attention.git flash-attention-v2 && \
+		cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) && \
+		git submodule update --init --recursive && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
+	fi
 
 install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
-	cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
+	if [ ! -d 'flash-attention-v2' ]; then \
+		cd flash-attention-v2 && \
+		GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install; \
+	fi
-vllm-cuda:
-	# Clone vllm
-	pip install -U ninja packaging --no-cache-dir
-	git clone https://github.com/Narsil/vllm.git vllm
-
-build-vllm-cuda: vllm-cuda
-	cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
-	cd vllm && python setup.py build
+build-vllm-cuda:
+	if [ ! -d 'vllm' ]; then \
+		pip install -U ninja packaging --no-cache-dir && \
+		git clone https://github.com/Narsil/vllm.git vllm && \
+		cd vllm && \
+		git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa && \
+		python setup.py build; \
+	fi
 
 install-vllm-cuda: build-vllm-cuda
-	pip uninstall vllm -y || true
-	cd vllm && python setup.py install
+	if [ ! -d 'vllm' ]; then \
+		cd vllm && pip install -e .; \
+	fi
 
-vllm-rocm:
-	# Clone vllm
-	pip install -U ninja packaging --no-cache-dir
-	git clone https://github.com/fxmarty/rocm-vllm.git vllm
-
-build-vllm-rocm: vllm-rocm
-	cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
-	cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install
+build-vllm-rocm:
+	if [ ! -d 'vllm' ]; then \
+		pip install -U ninja packaging --no-cache-dir && \
+		git clone https://github.com/fxmarty/rocm-vllm.git vllm && \
+		cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479 && \
+		PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
+	fi
 
 install-vllm-rocm: build-vllm-rocm
-	pip uninstall vllm -y || true
-	cd vllm && python setup.py install
+	if [ ! -d 'vllm' ]; then \
+		cd vllm && \
+		PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .; \
+	fi
...
@@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app'
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
-protobuf = "^4.21.7"
+protobuf = "^4.25.3"
 grpcio = "^1.51.1"
 grpcio-status = "^1.51.1"
 grpcio-reflection = "^1.51.1"
@@ -19,9 +19,9 @@ accelerate = { version = "^0.29.1", optional = true }
 bitsandbytes = { version = "^0.43.0", optional = true }
 safetensors = "^0.4"
 loguru = "^0.6.0"
-opentelemetry-api = "^1.15.0"
-opentelemetry-exporter-otlp = "^1.15.0"
-opentelemetry-instrumentation-grpc = "^0.36b0"
+opentelemetry-api = "^1.25.0"
+opentelemetry-exporter-otlp = "^1.25.0"
+opentelemetry-instrumentation-grpc = "^0.46b0"
 hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
 tokenizers = "^0.19.1"
@@ -34,7 +34,7 @@ peft = { version = "^0.10", optional = true }
 torch = { version = "^2.3.0", optional = true }
 scipy = "^1.11.1"
 pillow = "^10.0.0"
-outlines= { version = "^0.0.36", optional = true }
+outlines= { version = "^0.0.34", optional = true }
 prometheus-client = "^0.20.0"
 py-cpuinfo = "^9.0.0"
...
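The `^` markers above are Poetry caret requirements: `^4.25.3` means `>=4.25.3,<5.0.0`, i.e. the leftmost non-zero version component is pinned while the rest may float. A tiny shell check of the major-pinning rule (`caret_major` is a hypothetical helper for illustration, not part of Poetry):

```shell
# caret_major CONSTRAINT_MAJOR CANDIDATE_MAJOR
# For constraints with a non-zero major (e.g. ^4.25.3), the caret pins the
# major version: any 4.x >= 4.25.3 is allowed, 5.0 is not.
caret_major() {
    [ "$1" -eq "$2" ]
}

caret_major 4 4 && echo "protobuf 4.x satisfies ^4.25.3"
caret_major 4 5 || echo "protobuf 5.0 breaks ^4.25.3"
```

For `0.x` constraints like `^0.0.34`, the caret instead pins down to the first non-zero component, which is why the `outlines` bump is effectively an exact-series pin.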