name: test

on:
  pull_request:
    paths:
      - ".github/scripts/test_triton_server.py"
      - ".github/workflows/test.yml"
      - "cmake/**"
      - "src/**"
      - "3rdparty/**"
      - "lmdeploy/**"
      - "requirements/**"
      - "requirements.txt"
      - "CMakeLists.txt"
      - "setup.py"
  push:
    branches:
      - main
    paths:
      - "lmdeploy/version.py"
    tags:
      - "v*.*.*"
  workflow_dispatch:
    inputs:
      markers:
        required: false
        description: 'Tested markers. eg: "-m internlm_chat_7b"'
        type: string
        default: ''

env:
  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai

jobs:
  test_functions:
    runs-on: [self-hosted, linux-a100]
    timeout-minutes: 4320 # 72 hours
    environment: 'prod'
    env:
      REPORT_DIR: /nvme/qa_test_models/test-reports
    container:
      image: nvcr.io/nvidia/tritonserver:22.12-py3
      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
      volumes:
        - /nvme/github-actions/pip-cache:/root/.cache/pip
        - /nvme/github-actions/packages:/root/packages
        - /nvme/qa_test_models:/nvme/qa_test_models
        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    steps:
      - name: Setup systems
        run: |
          rm /etc/apt/sources.list.d/cuda*.list
          apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \
              libgoogle-glog-dev libgl1 openjdk-8-jre-headless
          dpkg -i /root/packages/allure_2.24.1-1_all.deb
          rm -rf /var/lib/apt/lists/*
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Install pytorch
        run: |
          python3 -m pip cache dir
          python3 -m pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
      - name: Build lmdeploy
        run: |
          python3 -m pip install cmake
          python3 -m pip install -r requirements/build.txt
          # use cached build
          cp -r ../../build build
          cd build
          cmake .. \
              -DCMAKE_BUILD_TYPE=RelWithDebInfo \
              -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
              -DCMAKE_INSTALL_PREFIX=./install \
              -DBUILD_PY_FFI=ON \
              -DBUILD_MULTI_GPU=ON \
              -DCMAKE_CUDA_FLAGS="-lineinfo" \
              -DUSE_NVTX=ON \
              -DSM=80 \
              -DCMAKE_CUDA_ARCHITECTURES=80 \
              -DBUILD_TEST=OFF
          make -j$(nproc) && make install
      - name: Install lmdeploy
        run: |
          python3 -m pip install packaging protobuf transformers_stream_generator
          python3 -m pip install -r requirements.txt -r requirements/test.txt
          python3 -m pip install .
      - name: Check env
        run: |
          python3 -m pip list
          lmdeploy check_env
      - name: Test lmdeploy
        run: |
          echo "TODO: awaiting PR of adding autotest"
          # pytest autotest ${{github.event.inputs.markers}} --alluredir=allure-results --clean-alluredir
      - name: Generate reports
        if: always()
        run: |
          if test -d "allure-results"; then
            export date_today="$(date +'%Y%m%d-%H%M%S')"
            export report_dir="$REPORT_DIR/$date_today"
            echo "Save report to $report_dir"
            allure generate -c -o $report_dir
          fi
      - name: Clear workfile
        if: always()
        run: |
          export workdir=$(pwd)
          cd ..
          rm -rf $workdir
          mkdir $workdir
          chmod -R 777 $workdir

  test_triton:
    runs-on: [self-hosted, linux-a100]
    timeout-minutes: 4320 # 72 hours
    environment: 'prod'
    env:
      HF_MODEL: /nvme/qa_test_models/internlm-chat-20b
      WORKDIR: /nvme/qa_test_models/triton_workspace
      TB_MODEL: internlm-chat-20b-fp16-tp2
      GRPC_PORT: 33337
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Create test container
        run: |
          export CONTAINER_ID=$(docker create \
            --rm \
            --gpus='"device=0,1"' \
            --shm-size 16g \
            --cap-add=SYS_PTRACE \
            --cap-add=SYS_ADMIN \
            --security-opt seccomp=unconfined \
            --name lmdeploy-ci-triton \
            --workdir /__w/lmdeploy/lmdeploy \
            --env PIP_CACHE_DIR=/root/.cache/pip \
            --env NCCL_LAUNCH_MODE=GROUP \
            -v $(pwd)/../../:/__w \
            -v ${HF_MODEL}:/root/workspace/hf_model \
            -v ${WORKDIR}:/root/workspace/workdir \
            -v ${HOST_PIP_CACHE_DIR}:/root/.cache/pip \
            -v ${HOST_LOCALTIME}:/etc/localtime:ro \
            openmmlab/lmdeploy:latest tail -f /dev/null \
          )
          docker start $CONTAINER_ID
          echo "CONTAINER_ID=$CONTAINER_ID"
          echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
      - name: Build lmdeploy from source
        run: |
          docker exec $CONTAINER_ID cp -r ../../build build
          docker exec --workdir /__w/lmdeploy/lmdeploy/build \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            $CONTAINER_ID cmake .. \
              -DCMAKE_BUILD_TYPE=RelWithDebInfo \
              -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
              -DCMAKE_INSTALL_PREFIX=./install \
              -DBUILD_PY_FFI=ON \
              -DBUILD_MULTI_GPU=ON \
              -DCMAKE_CUDA_FLAGS="-lineinfo" \
              -DUSE_NVTX=ON \
              -DSM=80 \
              -DCMAKE_CUDA_ARCHITECTURES=80 \
              -DBUILD_TEST=OFF
          docker exec --workdir /__w/lmdeploy/lmdeploy/build $CONTAINER_ID make -j$(nproc)
          docker exec --workdir /__w/lmdeploy/lmdeploy/build $CONTAINER_ID make install
      - name: Install lmdeploy
        run: |
          docker exec \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            $CONTAINER_ID python3 -m pip install tritonclient[grpc]
          docker exec \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            $CONTAINER_ID python3 -m pip install -r requirements/test.txt
          docker exec $CONTAINER_ID python3 -m pip install .
          # docker exec $CONTAINER_ID check_env
      - name: Convert to turbomind model
        run: |
          docker exec $CONTAINER_ID \
            lmdeploy convert \
            --model-name internlm-chat-20b \
            --model-path /root/workspace/hf_model \
            --tp 2 \
            --dst-path /root/workspace/workdir/${TB_MODEL}
      - name: Start triton server service
        run: |
          docker exec --detach $CONTAINER_ID \
            tritonserver \
            --model-repository=/root/workspace/workdir/${TB_MODEL}/model_repository \
            --allow-http=0 \
            --allow-grpc=1 \
            --grpc-port=${GRPC_PORT} \
            --log-verbose=0 \
            --allow-metrics=1
          # wait for triton server to fully start up
          sleep 180s
      - name: Test triton server
        run: |
          docker exec \
            --env no_proxy=localhost,127.0.0.1 \
            $CONTAINER_ID python3 .github/scripts/test_triton_server.py --port ${GRPC_PORT}
      - name: Clear workfile
        if: always()
        run: |
          export workdir=$(pwd)
          docker exec --workdir /__w/lmdeploy $CONTAINER_ID rm -rf lmdeploy
          mkdir $workdir
          chmod -R 777 $workdir
          docker exec --workdir /__w/lmdeploy $CONTAINER_ID rm -rf /root/workspace/workdir/${TB_MODEL}
          docker stop $CONTAINER_ID
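
# NOTE: a hedged sketch, not part of the jobs above. The "Start triton server
# service" step waits a fixed `sleep 180s` for startup. Assuming the
# tritonclient[grpc] package installed in the "Install lmdeploy" step, the wait
# could instead poll gRPC readiness until the server reports ready, e.g.:
#
#   until docker exec $CONTAINER_ID python3 -c "import tritonclient.grpc as g; exit(0 if g.InferenceServerClient('localhost:${GRPC_PORT}').is_server_ready() else 1)"; do sleep 10; done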