Commit b9e12416 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.3

parents e5d707db e9d3aa04
import os import os
import zipfile import zipfile
MAX_SIZE_MB = 100 MAX_SIZE_MB = 200
def print_top_10_largest_files(zip_file): def print_top_10_largest_files(zip_file):
......
# This script build the ROCm docker image and runs test inside it. # This script runs test inside the corresponding ROCm docker container.
set -ex set -ex
# Print ROCm version # Print ROCm version
echo "--- ROCm info" echo "--- ROCm info"
rocminfo rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes
docker volume prune -f
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs" echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state echo "reset" > /opt/amdgpu/etc/gpu_state
...@@ -19,15 +47,16 @@ done ...@@ -19,15 +47,16 @@ done
echo "--- Building container" echo "--- Building container"
sha=$(git rev-parse --short HEAD) sha=$(git rev-parse --short HEAD)
container_name=rocm_${sha} image_name=rocm_${sha}
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
docker build \ docker build \
-t ${container_name} \ -t ${image_name} \
-f Dockerfile.rocm \ -f Dockerfile.rocm \
--progress plain \ --progress plain \
. .
remove_docker_container() { remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${container_name} || true docker rm -f ${container_name} || docker image rm -f ${image_name} || true
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
...@@ -39,6 +68,6 @@ docker run \ ...@@ -39,6 +68,6 @@ docker run \
--rm \ --rm \
-e HF_TOKEN \ -e HF_TOKEN \
--name ${container_name} \ --name ${container_name} \
${container_name} \ ${image_name} \
/bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//") /bin/bash -c "${@}"
...@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.." ...@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite # run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$? bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$? bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite # run server-based benchmarks and upload the result to buildkite
...@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then ...@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code exit $bench_serving_exit_code
fi fi
/workspace/buildkite-agent artifact upload openai-*.json rm ShareGPT_V3_unfiltered_cleaned_split.json
/workspace/buildkite-agent artifact upload "*.json"
...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT ...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
...@@ -5,13 +5,16 @@ ...@@ -5,13 +5,16 @@
steps: steps:
- label: Regression Test - label: Regression Test
mirror_hardwares: [amd]
command: pytest -v -s test_regression.py command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test - label: AsyncEngine Test
#mirror_hardwares: [amd]
command: pytest -v -s async_engine command: pytest -v -s async_engine
- label: Basic Correctness Test - label: Basic Correctness Test
mirror_hardwares: [amd]
commands: commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
...@@ -24,59 +27,68 @@ steps: ...@@ -24,59 +27,68 @@ steps:
command: pytest -v -s core command: pytest -v -s core
- label: Distributed Comm Ops Test - label: Distributed Comm Ops Test
command: pytest -v -s test_comm_ops.py #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests/distributed" command: pytest -v -s distributed/test_comm_ops.py
working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
- label: Distributed Tests - label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
mirror_hardwares: [amd] mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands: commands:
- pytest -v -s test_pynccl_library.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- label: Distributed Tests (Multiple Groups) - label: Distributed Tests (Multiple Groups)
working_dir: "/vllm-workspace/tests/distributed" #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
commands: commands:
- pytest -v -s test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- label: Engine Test - label: Engine Test
mirror_hardwares: [amd] mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test - label: Entrypoints Test
mirror_hardwares: [amd]
commands: commands:
# these tests have to be separated, because each one will allocate all posible GPU memory - pytest -v -s test_inputs.py
- pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py - pytest -v -s entrypoints -m llm
- pytest -v -s entrypoints/test_server_oot_registration.py - pytest -v -s entrypoints -m openai
- label: Examples Test - label: Examples Test
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd] mirror_hardwares: [amd]
commands: commands:
# install aws cli for llava_example.py # install aws cli for llava_example.py
- pip install awscli # install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py - python3 offline_inference.py
- python3 offline_inference_with_prefix.py - python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py - python3 llm_engine_example.py
- python3 llava_example.py - python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- label: Kernels Test %N - label: Kernels Test %N
#mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4 parallelism: 4
- label: Models Test - label: Models Test
mirror_hardwares: [amd] #mirror_hardwares: [amd]
commands: commands:
- bash ../.buildkite/download-images.sh - bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py - pytest -v -s models --ignore=models/test_llava.py
- label: Llava Test - label: Llava Test
mirror_hardwares: [amd] mirror_hardwares: [amd]
...@@ -90,31 +102,53 @@ steps: ...@@ -90,31 +102,53 @@ steps:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Samplers Test - label: Samplers Test
#mirror_hardwares: [amd]
command: pytest -v -s samplers command: pytest -v -s samplers
- label: LogitsProcessor Test - label: LogitsProcessor Test
mirror_hardwares: [amd] mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py command: pytest -v -s test_logits_processor.py
- label: Utils Test
command: pytest -v -s test_utils.py
- label: Worker Test - label: Worker Test
mirror_hardwares: [amd] mirror_hardwares: [amd]
command: pytest -v -s worker command: pytest -v -s worker
- label: Speculative decoding tests - label: Speculative decoding tests
mirror_hardwares: [amd] #mirror_hardwares: [amd]
command: pytest -v -s spec_decode command: pytest -v -s spec_decode
- label: LoRA Test %N - label: LoRA Test %N
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT #mirror_hardwares: [amd]
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4 parallelism: 4
- label: LoRA Long Context (Distributed)
#mirror_hardwares: [amd]
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
# Temporarily run this way because we cannot clean up GPU mem usage
# for multi GPU tests.
# TODO(sang): Fix it.
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
- pytest -v -s lora/test_long_context.py::test_self_consistency
- pytest -v -s lora/test_long_context.py::test_quality
- pytest -v -s lora/test_long_context.py::test_max_len
- label: Tensorizer Test - label: Tensorizer Test
#mirror_hardwares: [amd]
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
- label: Metrics Test - label: Metrics Test
mirror_hardwares: [amd]
command: pytest -v -s metrics command: pytest -v -s metrics
- label: Quantization Test - label: Quantization Test
#mirror_hardwares: [amd]
command: pytest -v -s quantization command: pytest -v -s quantization
- label: Benchmarks - label: Benchmarks
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
{% set default_working_dir = "/vllm-workspace/tests" %} {% set default_working_dir = "/vllm-workspace/tests" %}
steps: steps:
- label: ":docker: build image" - label: ":docker: build image"
commands: commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
...@@ -14,6 +13,8 @@ steps: ...@@ -14,6 +13,8 @@ steps:
automatic: automatic:
- exit_status: -1 # Agent was lost - exit_status: -1 # Agent was lost
limit: 5 limit: 5
- exit_status: -10 # Agent was lost
limit: 5
- wait - wait
- group: "AMD Tests" - group: "AMD Tests"
...@@ -24,7 +25,7 @@ steps: ...@@ -24,7 +25,7 @@ steps:
- label: "AMD: {{ step.label }}" - label: "AMD: {{ step.label }}"
agents: agents:
queue: amd queue: amd
command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
{% endif %} {% endif %}
...@@ -53,6 +54,8 @@ steps: ...@@ -53,6 +54,8 @@ steps:
automatic: automatic:
- exit_status: -1 # Agent was lost - exit_status: -1 # Agent was lost
limit: 5 limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins: plugins:
- kubernetes: - kubernetes:
podSpec: podSpec:
......
BasedOnStyle: Google
UseTab: Never
IndentWidth: 2
ColumnLimit: 80
# Force pointers to the type for C++.
DerivePointerAlignment: false
PointerAlignment: Left
# Reordering #include statements can (and currently will) introduce errors
SortIncludes: false
# Style choices
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
IndentPPDirectives: BeforeHash
IncludeCategories:
- Regex: '^<'
Priority: 4
- Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
Priority: 3
- Regex: '^"(qoda|\.\.)/'
Priority: 2
- Regex: '.*'
Priority: 1
...@@ -59,6 +59,8 @@ body: ...@@ -59,6 +59,8 @@ body:
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs. If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
placeholder: | placeholder: |
A clear and concise description of what the bug is. A clear and concise description of what the bug is.
......
name: clang-format
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
jobs:
clang-format:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install clang-format==18.1.5
- name: Running clang-format
run: |
EXCLUDES=(
'csrc/moe/topk_softmax_kernels.cu'
'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
'csrc/punica/bgmv/bgmv_config.h'
'csrc/punica/bgmv/bgmv_impl.cuh'
'csrc/punica/bgmv/vec_dtypes.cuh'
'csrc/punica/punica_ops.cu'
'csrc/punica/type_convert.h'
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
| xargs clang-format --dry-run --Werror
\ No newline at end of file
...@@ -58,6 +58,9 @@ jobs: ...@@ -58,6 +58,9 @@ jobs:
- name: Setup ccache - name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2 uses: hendrikmuhs/ccache-action@v1.2
with:
create-symlink: true
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
- name: Set up Linux Env - name: Set up Linux Env
if: ${{ runner.os == 'Linux' }} if: ${{ runner.os == 'Linux' }}
......
...@@ -168,19 +168,47 @@ set(VLLM_EXT_SRC ...@@ -168,19 +168,47 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu" "csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
# "csrc/quantization/fp8/fp8_cuda_kernels.cu" # "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu" "csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp") "csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.0
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
)
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu") "csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
endif()
endif() endif()
define_gpu_extension_target( define_gpu_extension_target(
...@@ -190,6 +218,7 @@ define_gpu_extension_target( ...@@ -190,6 +218,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC} SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS} COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES} ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
WITH_SOABI) WITH_SOABI)
# #
...@@ -220,7 +249,8 @@ set(VLLM_PUNICA_EXT_SRC ...@@ -220,7 +249,8 @@ set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu" "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu" "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu" "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/punica_ops.cc") "csrc/punica/punica_ops.cu"
"csrc/punica/punica_pybind.cpp")
# #
# Copy GPU compilation flags+update for punica # Copy GPU compilation flags+update for punica
...@@ -244,6 +274,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA") ...@@ -244,6 +274,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
endif() endif()
endforeach() endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif() endif()
if (VLLM_PUNICA_GPU_ARCHES) if (VLLM_PUNICA_GPU_ARCHES)
...@@ -278,11 +311,6 @@ add_custom_target(default) ...@@ -278,11 +311,6 @@ add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
add_dependencies(default _C) add_dependencies(default _C)
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
...@@ -293,3 +321,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -293,3 +321,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
add_dependencies(default _punica_C) add_dependencies(default _punica_C)
endif() endif()
endif() endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
endif()
...@@ -79,31 +79,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -79,31 +79,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist RUN python3 check-wheel-size.py dist
# the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
# we need to remove it manually
RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### EXTENSION Build IMAGE #################### #################### EXTENSION Build IMAGE ####################
#################### FLASH_ATTENTION Build IMAGE ####################
FROM dev as flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version}
WORKDIR /usr/src/flash-attention-v2
# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir
#################### FLASH_ATTENTION Build IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
...@@ -122,10 +99,6 @@ RUN ldconfig /usr/local/cuda-12.4/compat/ ...@@ -122,10 +99,6 @@ RUN ldconfig /usr/local/cuda-12.4/compat/
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose pip install dist/*.whl --verbose
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
......
...@@ -17,4 +17,6 @@ RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.py ...@@ -17,4 +17,6 @@ RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.py
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -92,16 +92,23 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \ ...@@ -92,16 +92,23 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
COPY . . COPY . .
#RUN python3 -m pip install pynvml # to be removed eventually
RUN python3 -m pip install --upgrade pip numba RUN python3 -m pip install --upgrade pip numba
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \ pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \ && python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \ && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd .. && cd ..
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -39,7 +39,7 @@ vLLM支持 ...@@ -39,7 +39,7 @@ vLLM支持
1. 基于光源pytorch2.1.0基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.1.0、python、dtk及系统下载对应的镜像版本。 1. 基于光源pytorch2.1.0基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.1.0、python、dtk及系统下载对应的镜像版本。
2. 基于现有python环境:安装pytorch2.1.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/DAS1.0](https://cancon.hpccube.com:65024/4/main/pytorch/DAS1.0),根据python、dtk版本,下载对应pytorch2.1.0的whl包。安装命令如下: 2. 基于现有python环境:安装pytorch2.1.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.1.0的whl包。安装命令如下:
```shell ```shell
pip install torch* (下载的torch的whl包) pip install torch* (下载的torch的whl包)
pip install setuptools wheel pip install setuptools wheel
...@@ -47,7 +47,7 @@ pip install setuptools wheel ...@@ -47,7 +47,7 @@ pip install setuptools wheel
#### 源码编译安装 #### 源码编译安装
```shell ```shell
git clone https://developer.hpccube.com/codes/aicomponent/vllm # 根据需要的分支进行切换 git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的分支进行切换
``` ```
- 提供2种源码编译方式(进入vllm目录): - 提供2种源码编译方式(进入vllm目录):
...@@ -62,19 +62,19 @@ python3 setup.py install ...@@ -62,19 +62,19 @@ python3 setup.py install
``` ```
#### 运行基础环境准备 #### 运行基础环境准备
1、使用基于光源pytorch2.1.0基础镜像环境:docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310 1、使用上面基于光源pytorch2.1.0基础镜像环境
2、安装对应依赖包: 2、根据pytorch2.1.0、python、dtk及系统下载对应依赖包:
- triton:[https://cancon.hpccube.com:65024/4/main/triton/DAS1.0](https://cancon.hpccube.com:65024/4/main/triton/DAS1.0) - triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
- xformers:[https://cancon.hpccube.com:65024/4/main/xformers/DAS1.0](https://cancon.hpccube.com:65024/4/main/xformers/DAS1.0) - xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn/DAS1.0](https://cancon.hpccube.com:65024/4/main/flash_attn/DAS1.0) - flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
#### 注意事项 #### 注意事项
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.2 - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.3
## Known Issue ## Known Issue
- -
......
...@@ -14,6 +14,17 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -14,6 +14,17 @@ Easy, fast, and cheap LLM serving for everyone
</p> </p>
---
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
We are thrilled to announce our fourth vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://lu.ma/agivllm) and join us!
---
*Latest News* 🔥 *Latest News* 🔥
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
...@@ -51,41 +62,14 @@ vLLM is flexible and easy to use with: ...@@ -51,41 +62,14 @@ vLLM is flexible and easy to use with:
- (Experimental) Prefix caching support - (Experimental) Prefix caching support
- (Experimental) Multi-lora support - (Experimental) Multi-lora support
vLLM seamlessly supports many Hugging Face models, including the following architectures: vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.) - Mixture-of-Expert LLMs (e.g., Mixtral)
- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.) - Multi-modal LLMs (e.g., LLaVA)
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.) Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.) ## Getting Started
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
...@@ -93,9 +77,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get ...@@ -93,9 +77,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
pip install vllm pip install vllm
``` ```
## Getting Started Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
...@@ -105,6 +87,32 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started ...@@ -105,6 +87,32 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
We welcome and value any contributions and collaborations. We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
## Sponsors
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
- a16z
- AMD
- Anyscale
- AWS
- Crusoe Cloud
- Databricks
- DeepInfra
- Dropbox
- Lambda Lab
- NVIDIA
- Replicate
- Roblox
- RunPod
- Trainy
- UC Berkeley
- UC San Diego
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
## Citation ## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
......
...@@ -89,6 +89,9 @@ async def async_request_tgi( ...@@ -89,6 +89,9 @@ async def async_request_tgi(
output.latency = most_recent_timestamp - st output.latency = most_recent_timestamp - st
output.success = True output.success = True
output.generated_text = data["generated_text"] output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception: except Exception:
output.success = False output.success = False
exc_info = sys.exc_info() exc_info = sys.exc_info()
...@@ -276,6 +279,9 @@ async def async_request_openai_completions( ...@@ -276,6 +279,9 @@ async def async_request_openai_completions(
output.generated_text = generated_text output.generated_text = generated_text
output.success = True output.success = True
output.latency = latency output.latency = latency
else:
output.error = response.reason or ""
output.success = False
except Exception: except Exception:
output.success = False output.success = False
exc_info = sys.exc_info() exc_info = sys.exc_info()
......
"""Benchmark the latency of processing a single batch of requests.""" """Benchmark the latency of processing a single batch of requests."""
import argparse import argparse
import json
import time import time
from pathlib import Path from pathlib import Path
from typing import Optional from typing import List, Optional
import numpy as np import numpy as np
import torch import torch
from tqdm import tqdm from tqdm import tqdm
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.inputs import PromptStrictInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
...@@ -18,6 +20,8 @@ def main(args: argparse.Namespace): ...@@ -18,6 +20,8 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch, # NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches. # the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model, llm = LLM(model=args.model,
speculative_model=args.speculative_model,
num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer, tokenizer=args.tokenizer,
quantization=args.quantization, quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size, tensor_parallel_size=args.tensor_parallel_size,
...@@ -28,9 +32,11 @@ def main(args: argparse.Namespace): ...@@ -28,9 +32,11 @@ def main(args: argparse.Namespace):
quantization_param_path=args.quantization_param_path, quantization_param_path=args.quantization_param_path,
device=args.device, device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight, ray_workers_use_nsight=args.ray_workers_use_nsight,
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill, enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir, download_dir=args.download_dir,
block_size=args.block_size) block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization)
sampling_params = SamplingParams( sampling_params = SamplingParams(
n=args.n, n=args.n,
...@@ -44,7 +50,9 @@ def main(args: argparse.Namespace): ...@@ -44,7 +50,9 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000, dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size, size=(args.batch_size,
args.input_len)) args.input_len))
dummy_prompt_token_ids = dummy_prompt_token_ids.tolist() dummy_inputs: List[PromptStrictInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
def run_to_completion(profile_dir: Optional[str] = None): def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir: if profile_dir:
...@@ -55,13 +63,13 @@ def main(args: argparse.Namespace): ...@@ -55,13 +63,13 @@ def main(args: argparse.Namespace):
], ],
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p: str(profile_dir))) as p:
llm.generate(prompt_token_ids=dummy_prompt_token_ids, llm.generate(dummy_inputs,
sampling_params=sampling_params, sampling_params=sampling_params,
use_tqdm=False) use_tqdm=False)
print(p.key_averages()) print(p.key_averages())
else: else:
start_time = time.perf_counter() start_time = time.perf_counter()
llm.generate(prompt_token_ids=dummy_prompt_token_ids, llm.generate(dummy_inputs,
sampling_params=sampling_params, sampling_params=sampling_params,
use_tqdm=False) use_tqdm=False)
end_time = time.perf_counter() end_time = time.perf_counter()
...@@ -93,12 +101,24 @@ def main(args: argparse.Namespace): ...@@ -93,12 +101,24 @@ def main(args: argparse.Namespace):
for percentage, percentile in zip(percentages, percentiles): for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds') print(f'{percentage}% percentile latency: {percentile} seconds')
# Output JSON results if specified
if args.output_json:
results = {
"avg_latency": np.mean(latencies),
"latencies": latencies.tolist(),
"percentiles": dict(zip(percentages, percentiles.tolist())),
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Benchmark the latency of processing a single batch of ' description='Benchmark the latency of processing a single batch of '
'requests till completion.') 'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization', parser.add_argument('--quantization',
'-q', '-q',
...@@ -137,15 +157,13 @@ if __name__ == '__main__': ...@@ -137,15 +157,13 @@ if __name__ == '__main__':
action='store_true', action='store_true',
help='enforce eager mode and disable CUDA graph') help='enforce eager mode and disable CUDA graph')
parser.add_argument( parser.add_argument(
"--kv-cache-dtype", '--kv-cache-dtype',
type=str, type=str,
choices=['auto', 'fp8'], choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default='auto', default="auto",
help= help='Data type for kv cache storage. If "auto", will use model '
'Data type for kv cache storage. If "auto", will use model data type. ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater ' 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
'common inference criteria.')
parser.add_argument( parser.add_argument(
'--quantization-param-path', '--quantization-param-path',
type=str, type=str,
...@@ -181,6 +199,7 @@ if __name__ == '__main__': ...@@ -181,6 +199,7 @@ if __name__ == '__main__':
action='store_true', action='store_true',
help='If True, the prefill requests can be chunked based on the ' help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens') 'max_num_batched_tokens')
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument( parser.add_argument(
"--ray-workers-use-nsight", "--ray-workers-use-nsight",
action='store_true', action='store_true',
...@@ -191,5 +210,16 @@ if __name__ == '__main__': ...@@ -191,5 +210,16 @@ if __name__ == '__main__':
default=None, default=None,
help='directory to download and load the weights, ' help='directory to download and load the weights, '
'default to the default cache dir of huggingface') 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the latency results in JSON format.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
...@@ -17,6 +17,10 @@ On the client side, run: ...@@ -17,6 +17,10 @@ On the client side, run:
--dataset-path <path to dataset> \ --dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf --request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000 --num-prompts <num_prompts> # By default <num_prompts> is 1000
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
""" """
import argparse import argparse
import asyncio import asyncio
...@@ -211,6 +215,11 @@ def calculate_metrics( ...@@ -211,6 +215,11 @@ def calculate_metrics(
else: else:
actual_output_lens.append(0) actual_output_lens.append(0)
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
metrics = BenchmarkMetrics( metrics = BenchmarkMetrics(
completed=completed, completed=completed,
total_input=total_input, total_input=total_input,
...@@ -222,9 +231,9 @@ def calculate_metrics( ...@@ -222,9 +231,9 @@ def calculate_metrics(
1000, # ttfts is empty if streaming is not supported by backend 1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots) * 1000, mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots, 99) * 1000, p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
) )
return metrics, actual_output_lens return metrics, actual_output_lens
...@@ -246,6 +255,24 @@ async def benchmark( ...@@ -246,6 +255,24 @@ async def benchmark(
else: else:
raise ValueError(f"Unknown backend: {backend}") raise ValueError(f"Unknown backend: {backend}")
print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len = input_requests[0]
test_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=api_url,
prompt_len=test_prompt_len,
output_len=test_output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}")
else:
print("Initial test run completed. Starting main benchmark run...")
print(f"Traffic request rate: {request_rate}") print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests)) pbar = None if disable_tqdm else tqdm(total=len(input_requests))
......
...@@ -249,6 +249,18 @@ def main(args: argparse.Namespace): ...@@ -249,6 +249,18 @@ def main(args: argparse.Namespace):
print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s") print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.") parser = argparse.ArgumentParser(description="Benchmark the throughput.")
...@@ -318,15 +330,13 @@ if __name__ == "__main__": ...@@ -318,15 +330,13 @@ if __name__ == "__main__":
action="store_true", action="store_true",
help="enforce eager execution") help="enforce eager execution")
parser.add_argument( parser.add_argument(
"--kv-cache-dtype", '--kv-cache-dtype',
type=str, type=str,
choices=["auto", "fp8"], choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto", default="auto",
help= help='Data type for kv cache storage. If "auto", will use model '
'Data type for kv cache storage. If "auto", will use model data type. ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater ' 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
'common inference criteria.')
parser.add_argument( parser.add_argument(
'--quantization-param-path', '--quantization-param-path',
type=str, type=str,
...@@ -360,6 +370,11 @@ if __name__ == "__main__": ...@@ -360,6 +370,11 @@ if __name__ == "__main__":
default=None, default=None,
help='directory to download and load the weights, ' help='directory to download and load the weights, '
'default to the default cache dir of huggingface') 'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment