"ssh:/git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "0f8a98612d2879807039a5c679b7da42500df8d6"
Commit 1591c68f authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.2

parents 09bcf00b c7f2cf2b
import os
import zipfile
MAX_SIZE_MB = 100
def print_top_10_largest_files(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]:
print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
def check_wheel_size(directory):
for root, _, files in os.walk(directory):
for f in files:
if f.endswith(".whl"):
wheel_path = os.path.join(root, f)
wheel_size = os.path.getsize(wheel_path)
wheel_size_mb = wheel_size / (1024 * 1024)
if wheel_size_mb > MAX_SIZE_MB:
print(
f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
f"compare to the allowed size ({MAX_SIZE_MB} MB).")
print_top_10_largest_files(wheel_path)
return 1
else:
print(f"Wheel {wheel_path} is within the allowed size "
f"({wheel_size_mb} MB).")
return 0
if __name__ == "__main__":
import sys
sys.exit(check_wheel_size(sys.argv[1]))
# This script build the ROCm docker image and run the API server inside the container. # This script build the ROCm docker image and runs test inside it.
# It serves a sanity check for compilation and basic model usage.
set -ex set -ex
# Print ROCm version # Print ROCm version
echo "--- ROCm info"
rocminfo rocminfo
# Try building the docker image echo "--- Resetting GPUs"
docker build -t rocm -f Dockerfile.rocm .
# Setup cleanup echo "reset" > /opt/amdgpu/etc/gpu_state
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT while true; do
remove_docker_container sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
# Run the image echo "GPUs state is \"clean\""
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server & break
# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi fi
done done
echo "--- Building container"
sha=$(git rev-parse --short HEAD)
container_name=rocm_${sha}
docker build \
-t ${container_name} \
-f Dockerfile.rocm \
--progress plain \
.
remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${container_name} || true
} }
wait_for_server_to_start trap remove_docker_container EXIT
echo "--- Running container"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--rm \
-e HF_TOKEN \
--name ${container_name} \
${container_name} \
/bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
...@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md ...@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md echo '```' >> benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
exit 0
fi
# upload the results to buildkite # upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
......
...@@ -4,6 +4,20 @@ set -e ...@@ -4,6 +4,20 @@ set -e
# Try building the docker image # Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
echo $current_time > /tmp/neuron-docker-build-timestamp
fi
else
echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi
docker build -t neuron -f Dockerfile.neuron . docker build -t neuron -f Dockerfile.neuron .
# Setup cleanup # Setup cleanup
......
...@@ -17,27 +17,38 @@ steps: ...@@ -17,27 +17,38 @@ steps:
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test - label: Core Test
mirror_hardwares: [amd]
command: pytest -v -s core command: pytest -v -s core
- label: Distributed Comm Ops Test - label: Distributed Comm Ops Test
command: pytest -v -s test_comm_ops.py command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed" working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now. num_gpus: 2
- label: Distributed Tests - label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed" working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now. num_gpus: 2 # only support 1 or 2 for now.
mirror_hardwares: [amd]
commands: commands:
- pytest -v -s test_pynccl.py
- pytest -v -s test_pynccl_library.py - pytest -v -s test_pynccl_library.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
- label: Distributed Tests (Multiple Groups)
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 4
commands:
- pytest -v -s test_pynccl.py
- label: Engine Test - label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test - label: Entrypoints Test
...@@ -48,6 +59,7 @@ steps: ...@@ -48,6 +59,7 @@ steps:
- label: Examples Test - label: Examples Test
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands: commands:
# install aws cli for llava_example.py # install aws cli for llava_example.py
- pip install awscli - pip install awscli
...@@ -61,16 +73,19 @@ steps: ...@@ -61,16 +73,19 @@ steps:
parallelism: 4 parallelism: 4
- label: Models Test - label: Models Test
mirror_hardwares: [amd]
commands: commands:
- bash ../.buildkite/download-images.sh - bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
- label: Llava Test - label: Llava Test
mirror_hardwares: [amd]
commands: commands:
- bash ../.buildkite/download-images.sh - bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py - pytest -v -s models/test_llava.py
- label: Prefix Caching Test - label: Prefix Caching Test
mirror_hardwares: [amd]
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
...@@ -78,12 +93,15 @@ steps: ...@@ -78,12 +93,15 @@ steps:
command: pytest -v -s samplers command: pytest -v -s samplers
- label: LogitsProcessor Test - label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py command: pytest -v -s test_logits_processor.py
- label: Worker Test - label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker command: pytest -v -s worker
- label: Speculative decoding tests - label: Speculative decoding tests
mirror_hardwares: [amd]
command: pytest -v -s spec_decode command: pytest -v -s spec_decode
- label: LoRA Test %N - label: LoRA Test %N
...@@ -101,6 +119,7 @@ steps: ...@@ -101,6 +119,7 @@ steps:
- label: Benchmarks - label: Benchmarks
working_dir: "/vllm-workspace/.buildkite" working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands: commands:
- pip install aiohttp - pip install aiohttp
- bash run-benchmarks.sh - bash run-benchmarks.sh
......
...@@ -16,17 +16,29 @@ steps: ...@@ -16,17 +16,29 @@ steps:
limit: 5 limit: 5
- wait - wait
- label: "AMD Test" - group: "AMD Tests"
agents: depends_on: ~
queue: amd steps:
command: bash .buildkite/run-amd-test.sh {% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
{% endfor %}
- label: "Neuron Test" - label: "Neuron Test"
depends_on: ~
agents: agents:
queue: neuron queue: neuron
command: bash .buildkite/run-neuron-test.sh command: bash .buildkite/run-neuron-test.sh
soft_fail: true
- label: "CPU Test" - label: "Intel Test"
depends_on: ~
command: bash .buildkite/run-cpu-test.sh command: bash .buildkite/run-cpu-test.sh
{% for step in steps %} {% for step in steps %}
...@@ -44,6 +56,9 @@ steps: ...@@ -44,6 +56,9 @@ steps:
plugins: plugins:
- kubernetes: - kubernetes:
podSpec: podSpec:
{% if step.num_gpus %}
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
{% endif %}
volumes: volumes:
- name: dshm - name: dshm
emptyDir: emptyDir:
......
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]
body:
- type: markdown
attributes:
value: >
#### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
attributes:
label: Motivation.
description: >
The motivation of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Proposed Change.
description: >
The proposed change of the RFC.
validations:
required: true
- type: textarea
attributes:
label: Feedback Period.
description: >
The feedback period of the RFC. Usually at least one week.
validations:
required: false
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC.
validations:
required: false
- type: textarea
attributes:
label: Any Other Things.
description: >
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!
...@@ -33,8 +33,7 @@ jobs: ...@@ -33,8 +33,7 @@ jobs:
- name: Mypy - name: Mypy
run: | run: |
mypy vllm/attention --config-file pyproject.toml mypy vllm/attention --config-file pyproject.toml
# TODO(sang): Fix nested dir mypy vllm/core --config-file pyproject.toml
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml
...@@ -44,8 +43,8 @@ jobs: ...@@ -44,8 +43,8 @@ jobs:
mypy vllm/engine --config-file pyproject.toml mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/spec_decode --config-file pyproject.toml
# TODO(sang): Fix nested dir mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/model_executor/*.py --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml
# TODO(sang): Fix nested dir mypy vllm/logging --config-file pyproject.toml
# mypy vllm/lora/*.py --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml
...@@ -49,7 +49,7 @@ jobs: ...@@ -49,7 +49,7 @@ jobs:
matrix: matrix:
os: ['ubuntu-20.04'] os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11'] python-version: ['3.8', '3.9', '3.10', '3.11']
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements-cuda.txt. pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1'] cuda-version: ['11.8', '12.1']
steps: steps:
...@@ -79,6 +79,8 @@ jobs: ...@@ -79,6 +79,8 @@ jobs:
- name: Build wheel - name: Build wheel
shell: bash shell: bash
env:
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
run: | run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename) wheel_name=$(ls dist/*whl | xargs -n 1 basename)
......
...@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => { ...@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
generate_release_notes: true, generate_release_notes: true,
name: process.env.RELEASE_TAG, name: process.env.RELEASE_TAG,
owner: context.repo.owner, owner: context.repo.owner,
prerelease: false, prerelease: true,
repo: context.repo.repo, repo: context.repo.repo,
tag_name: process.env.RELEASE_TAG, tag_name: process.env.RELEASE_TAG,
}); });
......
...@@ -32,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 ...@@ -32,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm # versions are derived from Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1") set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
...@@ -178,6 +178,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -178,6 +178,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu") "csrc/custom_all_reduce.cu")
endif() endif()
......
# The vLLM Dockerfile is used to construct vLLM image that can be directly used # The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server. # to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y python3-pip git && apt-get install -y python3-pip git
...@@ -12,7 +16,7 @@ RUN apt-get update -y \ ...@@ -12,7 +16,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image # this won't be needed for future versions of this docker image
# or future versions of triton. # or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/ RUN ldconfig /usr/local/cuda-12.4/compat/
WORKDIR /workspace WORKDIR /workspace
...@@ -71,6 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -71,6 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist python3 setup.py bdist_wheel --dist-dir=dist
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist
# the `vllm_nccl` package must be installed from source distribution # the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs # pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want. # will directly use the wheel from the cache, which is not what we want.
...@@ -85,7 +93,7 @@ FROM dev as flash-attn-builder ...@@ -85,7 +93,7 @@ FROM dev as flash-attn-builder
ARG max_jobs=2 ARG max_jobs=2
ENV MAX_JOBS=${max_jobs} ENV MAX_JOBS=${max_jobs}
# flash attention version # flash attention version
ARG flash_attn_version=v2.5.6 ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version} ENV FLASH_ATTN_VERSION=${flash_attn_version}
WORKDIR /usr/src/flash-attention-v2 WORKDIR /usr/src/flash-attention-v2
...@@ -98,7 +106,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \ ...@@ -98,7 +106,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
RUN apt-get update -y \ RUN apt-get update -y \
...@@ -108,7 +116,7 @@ RUN apt-get update -y \ ...@@ -108,7 +116,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image # this won't be needed for future versions of this docker image
# or future versions of triton. # or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/ RUN ldconfig /usr/local/cuda-12.4/compat/
# install vllm wheel first, so that torch etc will be installed # install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
......
...@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \ ...@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
### Mount Point ### ### Mount Point ###
# When launching the container, mount the code directory to /app # When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app ARG APP_MOUNT=/vllm-workspace
VOLUME [ ${APP_MOUNT} ] VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT} WORKDIR ${APP_MOUNT}
...@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \ ...@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
&& cd ../..; \ && cd ../..; \
fi fi
COPY ./ /app/vllm WORKDIR /vllm-workspace
COPY . .
RUN python3 -m pip install --upgrade pip numba RUN python3 -m pip install --upgrade pip numba
RUN cd /app \ RUN --mount=type=cache,target=/root/.cache/pip \
&& cd vllm \ pip install -U -r requirements-rocm.txt \
&& pip install -U -r requirements-rocm.txt \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \ && python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd .. && cd ..
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
......
include LICENSE include LICENSE
include requirements-common.txt include requirements-common.txt
include requirements-cuda.txt include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt include CMakeLists.txt
recursive-include cmake * recursive-include cmake *
......
...@@ -74,7 +74,7 @@ python3 setup.py install ...@@ -74,7 +74,7 @@ python3 setup.py install
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.1 - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.2
## Known Issue ## Known Issue
- -
......
...@@ -74,10 +74,11 @@ vLLM seamlessly supports many Hugging Face models, including the following archi ...@@ -74,10 +74,11 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.) - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.) - OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.) - Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.) - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.) - Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.) - Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
......
...@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None): ...@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
def main(args): def main(args):
llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat", llm = LLM(model=args.model,
tokenizer_mode='auto', tokenizer_mode='auto',
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True, enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching) enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100 num_prompts = 100
prompts = [PROMPT] * num_prompts prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=100) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------") print("------warm up------")
test_prefix( test_prefix(
llm=llm, llm=llm,
prompts=prompts[:1], prompts=prompts,
sampling_params=sampling_params, sampling_params=sampling_params,
) )
...@@ -45,8 +47,16 @@ if __name__ == "__main__": ...@@ -45,8 +47,16 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic ' description='Benchmark the performance with or without automatic '
'prefix caching.') 'prefix caching.')
parser.add_argument('--model',
type=str,
default='baichuan-inc/Baichuan2-13B-Chat')
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching', parser.add_argument('--enable-prefix-caching',
action='store_true', action='store_true',
help='enable prefix caching') help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
...@@ -27,7 +27,7 @@ import time ...@@ -27,7 +27,7 @@ import time
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from typing import AsyncGenerator, List, Tuple from typing import AsyncGenerator, List, Optional, Tuple
import numpy as np import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
...@@ -58,7 +58,11 @@ def sample_sharegpt_requests( ...@@ -58,7 +58,11 @@ def sample_sharegpt_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]: ) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset. # Load the dataset.
with open(dataset_path) as f: with open(dataset_path) as f:
dataset = json.load(f) dataset = json.load(f)
...@@ -68,38 +72,32 @@ def sample_sharegpt_requests( ...@@ -68,38 +72,32 @@ def sample_sharegpt_requests(
dataset = [(data["conversations"][0]["value"], dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset] data["conversations"][1]["value"]) for data in dataset]
# some of these will be filtered out, so sample more than we need # Shuffle the dataset.
sampled_indices = random.sample(range(len(dataset)), random.shuffle(dataset)
int(num_requests * 1.2))
dataset = [dataset[i] for i in sampled_indices]
# Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids
completions = [completion for _, completion in dataset]
completion_token_ids = tokenizer(completions).input_ids
tokenized_dataset = []
for i in range(len(dataset)):
output_len = len(completion_token_ids[i])
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
# Filter out too long sequences. # Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = [] filtered_dataset: List[Tuple[str, int, int]] = []
for prompt, prompt_token_ids, output_len in tokenized_dataset: for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids) prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4: if prompt_len < 4 or output_len < 4:
# Prune too short sequences. # Prune too short sequences.
# This is because TGI causes errors when the input or output length
# is too short.
continue continue
if prompt_len > 1024 or prompt_len + output_len > 2048: if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences. # Prune too long sequences.
continue continue
filtered_dataset.append((prompt, prompt_len, output_len)) filtered_dataset.append((prompt, prompt_len, output_len))
# Sample the requests. return filtered_dataset
sampled_requests = random.sample(filtered_dataset, num_requests)
return sampled_requests
def sample_sonnet_requests( def sample_sonnet_requests(
...@@ -361,6 +359,7 @@ def main(args: argparse.Namespace): ...@@ -361,6 +359,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset, dataset_path=args.dataset,
num_requests=args.num_prompts, num_requests=args.num_prompts,
tokenizer=tokenizer, tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
) )
elif args.dataset_name == "sharegpt": elif args.dataset_name == "sharegpt":
...@@ -368,6 +367,7 @@ def main(args: argparse.Namespace): ...@@ -368,6 +367,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset_path, dataset_path=args.dataset_path,
num_requests=args.num_prompts, num_requests=args.num_prompts,
tokenizer=tokenizer, tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
) )
elif args.dataset_name == "sonnet": elif args.dataset_name == "sonnet":
...@@ -524,6 +524,12 @@ if __name__ == "__main__": ...@@ -524,6 +524,12 @@ if __name__ == "__main__":
default=1000, default=1000,
help="Number of prompts to process.", help="Number of prompts to process.",
) )
parser.add_argument(
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
parser.add_argument( parser.add_argument(
"--sonnet-input-len", "--sonnet-input-len",
type=int, type=int,
......
...@@ -103,25 +103,22 @@ def run_vllm( ...@@ -103,25 +103,22 @@ def run_vllm(
) )
# Add the requests to the engine. # Add the requests to the engine.
prompts = []
sampling_params = []
for prompt, _, output_len in requests: for prompt, _, output_len in requests:
sampling_params = SamplingParams( prompts.append(prompt)
n=n, sampling_params.append(
temperature=0.0 if use_beam_search else 1.0, SamplingParams(
top_p=1.0, n=n,
use_beam_search=use_beam_search, temperature=0.0 if use_beam_search else 1.0,
ignore_eos=True, top_p=1.0,
max_tokens=output_len, use_beam_search=use_beam_search,
) ignore_eos=True,
# FIXME(woosuk): Do not use internal method. max_tokens=output_len,
llm._add_request( ))
prompt=prompt,
prompt_token_ids=None,
sampling_params=sampling_params,
)
start = time.perf_counter() start = time.perf_counter()
# FIXME(woosuk): Do not use internal method. llm.generate(prompts, sampling_params, use_tqdm=True)
llm._run_engine(use_tqdm=True)
end = time.perf_counter() end = time.perf_counter()
return end - start return end - start
......
...@@ -6,7 +6,7 @@ from typing import Optional ...@@ -6,7 +6,7 @@ from typing import Optional
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from vllm._C import ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import ( from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype, dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm) optimized_dequantize_gemm)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment