Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
948 additions
and
267 deletions
+948
-267
.buildkite/nightly-benchmarks/kickoff-pipeline.sh
.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+26
-0
.buildkite/nightly-benchmarks/sample.yaml
.buildkite/nightly-benchmarks/sample.yaml
+39
-0
.buildkite/run-benchmarks.sh
.buildkite/run-benchmarks.sh
+4
-4
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+12
-2
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+17
-14
.buildkite/test-template-aws.j2
.buildkite/test-template-aws.j2
+64
-0
.buildkite/test-template.j2
.buildkite/test-template.j2
+5
-2
.github/workflows/mypy.yaml
.github/workflows/mypy.yaml
+1
-0
CMakeLists.txt
CMakeLists.txt
+9
-21
Dockerfile.cpu
Dockerfile.cpu
+6
-2
Dockerfile.neuron
Dockerfile.neuron
+1
-1
Dockerfile.rocm
Dockerfile.rocm
+3
-2
README_ORIGIN.md
README_ORIGIN.md
+8
-0
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+9
-1
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+22
-1
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+11
-2
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+352
-0
benchmarks/cutlass_benchmarks/weight_shapes.py
benchmarks/cutlass_benchmarks/weight_shapes.py
+37
-0
benchmarks/kernels/benchmark_mixtral_moe.py
benchmarks/kernels/benchmark_mixtral_moe.py
+0
-215
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+322
-0
No files found.
.buildkite/nightly-benchmarks/kickoff-pipeline.sh
0 → 100755
View file @
f48954a4
#!/usr/bin/env bash
#
# Kick off the nightly benchmark pipeline on Buildkite.
#
# Installs the tools the pipeline needs (curl, jq, minijinja-cli). For
# pull-request builds it continues only when the PR carries the
# 'perf-benchmarks' label, and then uploads the sample pipeline definition.

set -euo pipefail

# Install system packages.
# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed to be stable for scripted use.
apt-get update
apt-get install -y curl jq

# Install minijinja for templating
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
# Quote the path so a HOME containing whitespace does not break the source.
source "$HOME/.cargo/env"

# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
  # One label name per line.
  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')

  # Match the label as a whole line so that labels which merely contain
  # "perf-benchmarks" as a substring do not trigger the benchmarks.
  if grep -Fxq "perf-benchmarks" <<< "$PR_LABELS"; then
    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
  else
    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
    exit 0
  fi
fi

# Upload sample.yaml
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
.buildkite/nightly-benchmarks/sample.yaml
0 → 100644
View file @
f48954a4
steps
:
# NOTE(simon): You can create separate blocks for different jobs
-
label
:
"
A100:
NVIDIA
SMI"
agents
:
queue
:
A100
plugins
:
-
kubernetes
:
podSpec
:
containers
:
# - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
# TODO(simon): check latest main branch or use the PR image.
-
image
:
us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
command
:
-
bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
resources
:
limits
:
nvidia.com/gpu
:
8
volumeMounts
:
-
name
:
devshm
mountPath
:
/dev/shm
nodeSelector
:
nvidia.com/gpu.product
:
NVIDIA-A100-SXM4-80GB
volumes
:
-
name
:
devshm
emptyDir
:
medium
:
Memory
# TODO(simon): bring H100 online
# - label: "H100: NVIDIA SMI"
# agents:
# queue: H100
# plugins:
# - docker#v5.11.0:
# image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
# command:
# - bash -c 'nvidia-smi && nvidia-smi topo -m'
# propagate-environment: true
# ipc: host
# gpus: all
.buildkite/run-benchmarks.sh
View file @
f48954a4
...
@@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
...
@@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed
-n
'1p'
benchmark_serving.txt
>>
benchmark_results.md
# first line
sed
-n
'1p'
benchmark_serving.txt
>>
benchmark_results.md
# first line
echo
""
>>
benchmark_results.md
echo
""
>>
benchmark_results.md
echo
'```'
>>
benchmark_results.md
echo
'```'
>>
benchmark_results.md
tail
-n
2
0
benchmark_serving.txt
>>
benchmark_results.md
# last 2
0
lines
tail
-n
2
4
benchmark_serving.txt
>>
benchmark_results.md
# last 2
4
lines
echo
'```'
>>
benchmark_results.md
echo
'```'
>>
benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
# if the agent binary is not found, skip uploading the results, exit 0
if
[
!
-f
/
workspace
/buildkite-agent
]
;
then
if
[
!
-f
/
usr/bin
/buildkite-agent
]
;
then
exit
0
exit
0
fi
fi
# upload the results to buildkite
# upload the results to buildkite
/workspace/
buildkite-agent annotate
--style
"info"
--context
"benchmark-results"
< benchmark_results.md
buildkite-agent annotate
--style
"info"
--context
"benchmark-results"
< benchmark_results.md
# exit with the exit code of the benchmarks
# exit with the exit code of the benchmarks
if
[
$bench_latency_exit_code
-ne
0
]
;
then
if
[
$bench_latency_exit_code
-ne
0
]
;
then
...
@@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
...
@@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
fi
fi
rm
ShareGPT_V3_unfiltered_cleaned_split.json
rm
ShareGPT_V3_unfiltered_cleaned_split.json
/workspace/
buildkite-agent artifact upload
"*.json"
buildkite-agent artifact upload
"*.json"
.buildkite/run-cpu-test.sh
View file @
f48954a4
...
@@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; }
...
@@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; }
trap
remove_docker_container EXIT
trap
remove_docker_container EXIT
remove_docker_container
remove_docker_container
# Run the image and launch offline inference
# Run the image
docker run
--network
host
--env
VLLM_CPU_KVCACHE_SPACE
=
1
--name
cpu-test cpu-test python3 vllm/examples/offline_inference.py
docker run
-itd
-v
~/.cache/huggingface:/root/.cache/huggingface
--cpuset-cpus
=
48-95
--cpuset-mems
=
1
--network
host
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
4
--name
cpu-test cpu-test
# offline inference
docker
exec
cpu-test bash
-c
"python3 examples/offline_inference.py"
# Run basic model test
docker
exec
cpu-test bash
-c
"cd tests;
pip install pytest Pillow protobuf
bash ../.buildkite/download-images.sh
cd ../
pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
.buildkite/test-pipeline.yaml
View file @
f48954a4
...
@@ -37,6 +37,7 @@ steps:
...
@@ -37,6 +37,7 @@ steps:
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
num_gpus
:
2
commands
:
commands
:
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
...
@@ -46,6 +47,7 @@ steps:
...
@@ -46,6 +47,7 @@ steps:
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-
pytest -v -s spec_decode/e2e/test_integration_dist.py
-
pytest -v -s spec_decode/e2e/test_integration_dist.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
label
:
Distributed Tests (Multiple Groups)
-
label
:
Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
...
@@ -62,7 +64,6 @@ steps:
...
@@ -62,7 +64,6 @@ steps:
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
commands
:
commands
:
-
pytest -v -s test_inputs.py
-
pytest -v -s entrypoints -m llm
-
pytest -v -s entrypoints -m llm
-
pytest -v -s entrypoints -m openai
-
pytest -v -s entrypoints -m openai
...
@@ -79,6 +80,13 @@ steps:
...
@@ -79,6 +80,13 @@ steps:
-
python3 llava_example.py
-
python3 llava_example.py
-
python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
label
:
Inputs Test
#mirror_hardwares: [amd]
commands
:
-
bash ../.buildkite/download-images.sh
-
pytest -v -s test_inputs.py
-
pytest -v -s multimodal
-
label
:
Kernels Test %N
-
label
:
Kernels Test %N
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
command
:
pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
command
:
pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
...
@@ -87,14 +95,13 @@ steps:
...
@@ -87,14 +95,13 @@ steps:
-
label
:
Models Test
-
label
:
Models Test
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
commands
:
commands
:
-
bash ../.buildkite/download-images.sh
-
pytest -v -s models -m \"not llava\"
-
pytest -v -s models --ignore=models/test_llava.py
-
label
:
Llava Test
-
label
:
Llava Test
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
commands
:
commands
:
-
bash ../.buildkite/download-images.sh
-
bash ../.buildkite/download-images.sh
-
pytest -v -s models
/test_
llava
.py
-
pytest -v -s models
-m
llava
-
label
:
Prefix Caching Test
-
label
:
Prefix Caching Test
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
...
@@ -118,7 +125,10 @@ steps:
...
@@ -118,7 +125,10 @@ steps:
-
label
:
Speculative decoding tests
-
label
:
Speculative decoding tests
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
command
:
pytest -v -s spec_decode
commands
:
# See https://github.com/vllm-project/vllm/issues/5152
-
export VLLM_ATTENTION_BACKEND=XFORMERS
-
pytest -v -s spec_decode
-
label
:
LoRA Test %N
-
label
:
LoRA Test %N
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
...
@@ -130,14 +140,7 @@ steps:
...
@@ -130,14 +140,7 @@ steps:
num_gpus
:
4
num_gpus
:
4
# This test runs llama 13B, so it is required to run on 4 GPUs.
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands
:
commands
:
# Temporarily run this way because we cannot clean up GPU mem usage
-
pytest -v -s -x lora/test_long_context.py
# for multi GPU tests.
# TODO(sang): Fix it.
-
pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
-
pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
-
pytest -v -s lora/test_long_context.py::test_self_consistency
-
pytest -v -s lora/test_long_context.py::test_quality
-
pytest -v -s lora/test_long_context.py::test_max_len
-
label
:
Tensorizer Test
-
label
:
Tensorizer Test
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
...
...
.buildkite/test-template-aws.j2
0 → 100644
View file @
f48954a4
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: ":docker: build image"
agents:
queue: cpu_queue
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
- wait
{% for step in steps %}
- label: "{{ step.label }}"
agents:
{% if step.label == "Documentation Build" %}
queue: small_cpu_queue
{% elif step.no_gpu %}
queue: cpu_queue
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
queue: gpu_4_queue
{% else %}
queue: gpu_1_queue
{% endif %}
soft_fail: true
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- docker#v5.2.0:
image: {{ docker_image }}
always-pull: true
propagate-environment: true
{% if not step.no_gpu %}
gpus: all
{% endif %}
{% if step.label == "Benchmarks" %}
mount-buildkite-agent: true
{% endif %}
command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
environment:
- VLLM_USAGE_SOURCE=ci-test
- HF_TOKEN
{% if step.label == "Speculative decoding tests" %}
- VLLM_ATTENTION_BACKEND=XFORMERS
{% endif %}
volumes:
- /dev/shm:/dev/shm
{% endfor %}
.buildkite/test-template.j2
View file @
f48954a4
...
@@ -28,6 +28,7 @@ steps:
...
@@ -28,6 +28,7 @@ steps:
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
env:
DOCKER_BUILDKIT: "1"
DOCKER_BUILDKIT: "1"
soft_fail: true
{% endif %}
{% endif %}
{% endfor %}
{% endfor %}
...
@@ -36,10 +37,12 @@ steps:
...
@@ -36,10 +37,12 @@ steps:
agents:
agents:
queue: neuron
queue: neuron
command: bash .buildkite/run-neuron-test.sh
command: bash .buildkite/run-neuron-test.sh
soft_fail:
tru
e
soft_fail:
fals
e
- label: "Intel Test"
- label: "Intel Test"
depends_on: ~
depends_on: ~
agents:
queue: intel
command: bash .buildkite/run-cpu-test.sh
command: bash .buildkite/run-cpu-test.sh
{% for step in steps %}
{% for step in steps %}
...
...
.github/workflows/mypy.yaml
View file @
f48954a4
...
@@ -37,6 +37,7 @@ jobs:
...
@@ -37,6 +37,7 @@ jobs:
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
...
...
CMakeLists.txt
View file @
f48954a4
...
@@ -67,19 +67,6 @@ endif()
...
@@ -67,19 +67,6 @@ endif()
#
#
find_package
(
Torch REQUIRED
)
find_package
(
Torch REQUIRED
)
#
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
# `libtorch_python.so` for linking against an extension. Torch's cmake
# configuration does not include this library (presumably since the cmake
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by VLLM extensions for this
# reason. So, add it by manually with `find_library` using torch's
# installed library path.
#
find_library
(
torch_python_LIBRARY torch_python PATHS
"
${
TORCH_INSTALL_PREFIX
}
/lib"
)
#
#
# Forward the non-CUDA device extensions to external CMake scripts.
# Forward the non-CUDA device extensions to external CMake scripts.
#
#
...
@@ -172,7 +159,7 @@ set(VLLM_EXT_SRC
...
@@ -172,7 +159,7 @@ set(VLLM_EXT_SRC
# "csrc/quantization/fp8/common.cu"
# "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/
pybind
.cpp"
)
"csrc/
torch_bindings
.cpp"
)
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
)
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
)
include
(
FetchContent
)
include
(
FetchContent
)
...
@@ -219,6 +206,7 @@ define_gpu_extension_target(
...
@@ -219,6 +206,7 @@ define_gpu_extension_target(
COMPILE_FLAGS
${
VLLM_GPU_FLAGS
}
COMPILE_FLAGS
${
VLLM_GPU_FLAGS
}
ARCHITECTURES
${
VLLM_GPU_ARCHES
}
ARCHITECTURES
${
VLLM_GPU_ARCHES
}
INCLUDE_DIRECTORIES
${
CUTLASS_INCLUDE_DIR
}
;
${
CUTLASS_TOOLS_UTIL_INCLUDE_DIR
}
INCLUDE_DIRECTORIES
${
CUTLASS_INCLUDE_DIR
}
;
${
CUTLASS_TOOLS_UTIL_INCLUDE_DIR
}
USE_SABI 3
WITH_SOABI
)
WITH_SOABI
)
#
#
...
@@ -226,7 +214,7 @@ define_gpu_extension_target(
...
@@ -226,7 +214,7 @@ define_gpu_extension_target(
#
#
set
(
VLLM_MOE_EXT_SRC
set
(
VLLM_MOE_EXT_SRC
"csrc/moe/
moe_op
s.cpp"
"csrc/moe/
torch_binding
s.cpp"
"csrc/moe/topk_softmax_kernels.cu"
)
"csrc/moe/topk_softmax_kernels.cu"
)
define_gpu_extension_target
(
define_gpu_extension_target
(
...
@@ -236,6 +224,7 @@ define_gpu_extension_target(
...
@@ -236,6 +224,7 @@ define_gpu_extension_target(
SOURCES
${
VLLM_MOE_EXT_SRC
}
SOURCES
${
VLLM_MOE_EXT_SRC
}
COMPILE_FLAGS
${
VLLM_GPU_FLAGS
}
COMPILE_FLAGS
${
VLLM_GPU_FLAGS
}
ARCHITECTURES
${
VLLM_GPU_ARCHES
}
ARCHITECTURES
${
VLLM_GPU_ARCHES
}
USE_SABI 3
WITH_SOABI
)
WITH_SOABI
)
#
#
...
@@ -250,7 +239,7 @@ set(VLLM_PUNICA_EXT_SRC
...
@@ -250,7 +239,7 @@ set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
"csrc/punica/punica_ops.cu"
"csrc/punica/punica_ops.cu"
"csrc/punica/
punica_pybind
.cpp"
)
"csrc/punica/
torch_bindings
.cpp"
)
#
#
# Copy GPU compilation flags+update for punica
# Copy GPU compilation flags+update for punica
...
@@ -287,6 +276,7 @@ if (VLLM_PUNICA_GPU_ARCHES)
...
@@ -287,6 +276,7 @@ if (VLLM_PUNICA_GPU_ARCHES)
SOURCES
${
VLLM_PUNICA_EXT_SRC
}
SOURCES
${
VLLM_PUNICA_EXT_SRC
}
COMPILE_FLAGS
${
VLLM_PUNICA_GPU_FLAGS
}
COMPILE_FLAGS
${
VLLM_PUNICA_GPU_FLAGS
}
ARCHITECTURES
${
VLLM_PUNICA_GPU_ARCHES
}
ARCHITECTURES
${
VLLM_PUNICA_GPU_ARCHES
}
USE_SABI 3
WITH_SOABI
)
WITH_SOABI
)
else
()
else
()
message
(
WARNING
"Unable to create _punica_C target because none of the "
message
(
WARNING
"Unable to create _punica_C target because none of the "
...
@@ -312,6 +302,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
...
@@ -312,6 +302,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message
(
STATUS
"Enabling C extension."
)
message
(
STATUS
"Enabling C extension."
)
add_dependencies
(
default _C
)
add_dependencies
(
default _C
)
message
(
STATUS
"Enabling moe extension."
)
add_dependencies
(
default _moe_C
)
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
# there are supported target arches.
# there are supported target arches.
...
@@ -321,8 +314,3 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
...
@@ -321,8 +314,3 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
add_dependencies
(
default _punica_C
)
add_dependencies
(
default _punica_C
)
endif
()
endif
()
endif
()
endif
()
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
)
message
(
STATUS
"Enabling moe extension."
)
add_dependencies
(
default _moe_C
)
endif
()
Dockerfile.cpu
View file @
f48954a4
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04
FROM ubuntu:22.04
AS cpu-test-1
RUN apt-get update -y \
RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
RUN pip install --upgrade pip \
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja setuptools>=49.4.0 numpy
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
FROM cpu-test-1 AS build
COPY ./ /workspace/vllm
COPY ./ /workspace/vllm
...
@@ -19,4 +21,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
...
@@ -19,4 +21,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
CMD ["/bin/bash"]
CMD ["/bin/bash"]
Dockerfile.neuron
View file @
f48954a4
...
@@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
...
@@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \
RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt
&& python3 -m pip install -U -r requirements-neuron.txt
ENV VLLM_
BUILD_WITH_NEURON 1
ENV VLLM_
TARGET_DEVICE neuron
RUN cd /app/vllm \
RUN cd /app/vllm \
&& pip install -e . \
&& pip install -e . \
&& cd ..
&& cd ..
...
...
Dockerfile.rocm
View file @
f48954a4
...
@@ -106,8 +106,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
...
@@ -106,8 +106,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
&& cd ..
&& cd ..
...
...
README_ORIGIN.md
View file @
f48954a4
...
@@ -16,6 +16,13 @@ Easy, fast, and cheap LLM serving for everyone
...
@@ -16,6 +16,13 @@ Easy, fast, and cheap LLM serving for everyone
---
---
**Ray Summit CFP is Open (June 4th to June 20th)!**
There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
This will be a great chance for everyone in the community to get together and learn.
Please submit your proposal
[
here
](
https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite
)
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
We are thrilled to announce our fourth vLLM Meetup!
We are thrilled to announce our fourth vLLM Meetup!
...
@@ -107,6 +114,7 @@ vLLM is a community project. Our compute resources for development and testing a
...
@@ -107,6 +114,7 @@ vLLM is a community project. Our compute resources for development and testing a
-
Replicate
-
Replicate
-
Roblox
-
Roblox
-
RunPod
-
RunPod
-
Sequoia Capital
-
Trainy
-
Trainy
-
UC Berkeley
-
UC Berkeley
-
UC San Diego
-
UC San Diego
...
...
benchmarks/benchmark_latency.py
View file @
f48954a4
...
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
...
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
enable_chunked_prefill
=
args
.
enable_chunked_prefill
,
enable_chunked_prefill
=
args
.
enable_chunked_prefill
,
download_dir
=
args
.
download_dir
,
download_dir
=
args
.
download_dir
,
block_size
=
args
.
block_size
,
block_size
=
args
.
block_size
,
gpu_memory_utilization
=
args
.
gpu_memory_utilization
)
gpu_memory_utilization
=
args
.
gpu_memory_utilization
,
distributed_executor_backend
=
args
.
distributed_executor_backend
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
n
=
args
.
n
,
n
=
args
.
n
,
...
@@ -221,5 +222,12 @@ if __name__ == '__main__':
...
@@ -221,5 +222,12 @@ if __name__ == '__main__':
help
=
'the fraction of GPU memory to be used for '
help
=
'the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.'
)
'If unspecified, will use the default value of 0.9.'
)
parser
.
add_argument
(
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
],
default
=
None
,
help
=
'Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
benchmarks/benchmark_serving.py
View file @
f48954a4
...
@@ -56,6 +56,9 @@ class BenchmarkMetrics:
...
@@ -56,6 +56,9 @@ class BenchmarkMetrics:
mean_tpot_ms
:
float
mean_tpot_ms
:
float
median_tpot_ms
:
float
median_tpot_ms
:
float
p99_tpot_ms
:
float
p99_tpot_ms
:
float
mean_itl_ms
:
float
median_itl_ms
:
float
p99_itl_ms
:
float
def
sample_sharegpt_requests
(
def
sample_sharegpt_requests
(
...
@@ -200,16 +203,24 @@ def calculate_metrics(
...
@@ -200,16 +203,24 @@ def calculate_metrics(
actual_output_lens
=
[]
actual_output_lens
=
[]
total_input
=
0
total_input
=
0
completed
=
0
completed
=
0
itls
=
[]
tpots
=
[]
tpots
=
[]
ttfts
=
[]
ttfts
=
[]
for
i
in
range
(
len
(
outputs
)):
for
i
in
range
(
len
(
outputs
)):
if
outputs
[
i
].
success
:
if
outputs
[
i
].
success
:
output_len
=
len
(
tokenizer
(
outputs
[
i
].
generated_text
).
input_ids
)
# We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together
# Note: this may inflate the output token count slightly
output_len
=
len
(
tokenizer
(
outputs
[
i
].
generated_text
,
add_special_tokens
=
False
).
input_ids
)
actual_output_lens
.
append
(
output_len
)
actual_output_lens
.
append
(
output_len
)
total_input
+=
input_requests
[
i
][
1
]
total_input
+=
input_requests
[
i
][
1
]
if
output_len
>
1
:
if
output_len
>
1
:
tpots
.
append
(
tpots
.
append
(
(
outputs
[
i
].
latency
-
outputs
[
i
].
ttft
)
/
(
output_len
-
1
))
(
outputs
[
i
].
latency
-
outputs
[
i
].
ttft
)
/
(
output_len
-
1
))
itls
+=
outputs
[
i
].
itl
ttfts
.
append
(
outputs
[
i
].
ttft
)
ttfts
.
append
(
outputs
[
i
].
ttft
)
completed
+=
1
completed
+=
1
else
:
else
:
...
@@ -234,6 +245,9 @@ def calculate_metrics(
...
@@ -234,6 +245,9 @@ def calculate_metrics(
mean_tpot_ms
=
np
.
mean
(
tpots
or
0
)
*
1000
,
mean_tpot_ms
=
np
.
mean
(
tpots
or
0
)
*
1000
,
median_tpot_ms
=
np
.
median
(
tpots
or
0
)
*
1000
,
median_tpot_ms
=
np
.
median
(
tpots
or
0
)
*
1000
,
p99_tpot_ms
=
np
.
percentile
(
tpots
or
0
,
99
)
*
1000
,
p99_tpot_ms
=
np
.
percentile
(
tpots
or
0
,
99
)
*
1000
,
mean_itl_ms
=
np
.
mean
(
itls
or
0
)
*
1000
,
median_itl_ms
=
np
.
median
(
itls
or
0
)
*
1000
,
p99_itl_ms
=
np
.
percentile
(
itls
or
0
,
99
)
*
1000
,
)
)
return
metrics
,
actual_output_lens
return
metrics
,
actual_output_lens
...
@@ -333,6 +347,10 @@ async def benchmark(
...
@@ -333,6 +347,10 @@ async def benchmark(
print
(
"{:<40} {:<10.2f}"
.
format
(
"Median TPOT (ms):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Median TPOT (ms):"
,
metrics
.
median_tpot_ms
))
metrics
.
median_tpot_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"P99 TPOT (ms):"
,
metrics
.
p99_tpot_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"P99 TPOT (ms):"
,
metrics
.
p99_tpot_ms
))
print
(
"{s:{c}^{n}}"
.
format
(
s
=
'Inter-token Latency'
,
n
=
50
,
c
=
'-'
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Mean ITL (ms):"
,
metrics
.
mean_itl_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Median ITL (ms):"
,
metrics
.
median_itl_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"P99 ITL (ms):"
,
metrics
.
p99_itl_ms
))
print
(
"="
*
50
)
print
(
"="
*
50
)
result
=
{
result
=
{
...
@@ -349,6 +367,9 @@ async def benchmark(
...
@@ -349,6 +367,9 @@ async def benchmark(
"mean_tpot_ms"
:
metrics
.
mean_tpot_ms
,
"mean_tpot_ms"
:
metrics
.
mean_tpot_ms
,
"median_tpot_ms"
:
metrics
.
median_tpot_ms
,
"median_tpot_ms"
:
metrics
.
median_tpot_ms
,
"p99_tpot_ms"
:
metrics
.
p99_tpot_ms
,
"p99_tpot_ms"
:
metrics
.
p99_tpot_ms
,
"mean_itl_ms"
:
metrics
.
mean_itl_ms
,
"median_itl_ms"
:
metrics
.
median_itl_ms
,
"p99_itl_ms"
:
metrics
.
p99_itl_ms
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"output_lens"
:
actual_output_lens
,
"output_lens"
:
actual_output_lens
,
"ttfts"
:
[
output
.
ttft
for
output
in
outputs
],
"ttfts"
:
[
output
.
ttft
for
output
in
outputs
],
...
...
benchmarks/benchmark_throughput.py
View file @
f48954a4
...
@@ -78,6 +78,7 @@ def run_vllm(
...
@@ -78,6 +78,7 @@ def run_vllm(
enable_prefix_caching
:
bool
,
enable_prefix_caching
:
bool
,
enable_chunked_prefill
:
bool
,
enable_chunked_prefill
:
bool
,
max_num_batched_tokens
:
int
,
max_num_batched_tokens
:
int
,
distributed_executor_backend
:
Optional
[
str
],
gpu_memory_utilization
:
float
=
0.9
,
gpu_memory_utilization
:
float
=
0.9
,
download_dir
:
Optional
[
str
]
=
None
,
download_dir
:
Optional
[
str
]
=
None
,
)
->
float
:
)
->
float
:
...
@@ -100,6 +101,7 @@ def run_vllm(
...
@@ -100,6 +101,7 @@ def run_vllm(
download_dir
=
download_dir
,
download_dir
=
download_dir
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
)
)
# Add the requests to the engine.
# Add the requests to the engine.
...
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
...
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
gpu_memory_utilization
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
download_dir
)
args
.
gpu_memory_utilization
,
args
.
download_dir
)
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
...
@@ -375,6 +377,13 @@ if __name__ == "__main__":
...
@@ -375,6 +377,13 @@ if __name__ == "__main__":
type
=
str
,
type
=
str
,
default
=
None
,
default
=
None
,
help
=
'Path to save the throughput results in JSON format.'
)
help
=
'Path to save the throughput results in JSON format.'
)
parser
.
add_argument
(
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
],
default
=
None
,
help
=
'Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
if
args
.
tokenizer
is
None
:
args
.
tokenizer
=
args
.
model
args
.
tokenizer
=
args
.
model
...
...
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
0 → 100644
View file @
f48954a4
import
argparse
import
copy
import
itertools
import
pickle
as
pkl
import
time
from
typing
import
Callable
,
Iterable
,
List
,
Tuple
import
torch
import
torch.utils.benchmark
as
TBenchmark
from
torch.utils.benchmark
import
Measurement
as
TMeasurement
from
weight_shapes
import
WEIGHT_SHAPES
from
vllm
import
_custom_ops
as
ops
# Defaults for the `model_bench` sub-command.
# Every model listed in WEIGHT_SHAPES except the first entry.
DEFAULT_MODELS = list(WEIGHT_SHAPES)[1:]
# Values used for the M (batch) dimension when --batch-sizes is not given.
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
# Tensor-parallel sizes used when --tp-sizes is not given.
DEFAULT_TP_SIZES = [1]
# helpers
def
to_fp8
(
tensor
:
torch
.
tensor
)
->
torch
.
tensor
:
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
return
torch
.
round
(
tensor
.
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)).
to
(
dtype
=
torch
.
float8_e4m3fn
)
def to_int8(tensor: torch.tensor) -> torch.tensor:
    """Clamp `tensor` to [-128, 127], round it, and cast to int8."""
    clamped = tensor.clamp(min=-128, max=127)
    rounded = torch.round(clamped)
    return rounded.to(dtype=torch.int8)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
                      k: int) -> Tuple[torch.tensor, torch.tensor]:
    """Create random GEMM operands on the CUDA device, quantized to `dtype`.

    `a` has shape (m, k). `b` is generated as (n, k) and transposed, so it is
    a (k, n) view. Both are scaled by 5 before quantization.

    Raises:
        ValueError: if `dtype` is neither torch.int8 nor torch.float8_e4m3fn.
    """
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

    if dtype == torch.int8:
        quantize = to_int8
    elif dtype == torch.float8_e4m3fn:
        quantize = to_fp8
    else:
        raise ValueError("unsupported dtype")

    return quantize(a), quantize(b)
# impl
def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                    scale_b: torch.tensor,
                    out_dtype: torch.dtype) -> torch.tensor:
    """Baseline matmul: plain `torch.mm(a, b)`.

    `scale_a`, `scale_b` and `out_dtype` are accepted only so the signature
    matches the other implementations; they are not used.
    """
    product = torch.mm(a, b)
    return product
def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                     scale_b: torch.tensor,
                     out_dtype: torch.dtype) -> torch.tensor:
    """Run `torch._scaled_mm` on `a` and `b` with the given scales and output dtype."""
    scaled_mm_kwargs = {
        "scale_a": scale_a,
        "scale_b": scale_b,
        "out_dtype": out_dtype,
    }
    return torch._scaled_mm(a, b, **scaled_mm_kwargs)
def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
                                scale_a: torch.tensor, scale_b: torch.tensor,
                                out_dtype: torch.dtype) -> torch.tensor:
    """Run `torch._scaled_mm` like `pytorch_fp8_impl`, with `use_fast_accum=True`."""
    scaled_mm_kwargs = {
        "scale_a": scale_a,
        "scale_b": scale_b,
        "out_dtype": out_dtype,
        "use_fast_accum": True,
    }
    return torch._scaled_mm(a, b, **scaled_mm_kwargs)
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                 scale_b: torch.tensor,
                 out_dtype: torch.dtype) -> torch.tensor:
    """Run vLLM's `cutlass_scaled_mm_dq` custom op on the given operands."""
    result = ops.cutlass_scaled_mm_dq(a,
                                      b,
                                      scale_a,
                                      scale_b,
                                      out_dtype=out_dtype)
    return result
# bench
def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
             sub_label: str, fn: Callable, description: str) -> TMeasurement:
    """Time `fn(a, b, scale_a, scale_b, out_dtype)` with torch.utils.benchmark.

    `label`, `sub_label` and `description` are forwarded to the Timer.
    Returns the measurement produced by `blocked_autorange` with a minimum
    run time of 1.
    """
    min_run_time = 1

    # Names made visible to the timed statement.
    timer_env = dict(a=a,
                     b=b,
                     scale_a=scale_a,
                     scale_b=scale_b,
                     out_dtype=out_dtype,
                     fn=fn)
    timer = TBenchmark.Timer(
        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
        globals=timer_env,
        label=label,
        sub_label=sub_label,
        description=description,
    )
    return timer.blocked_autorange(min_run_time=min_run_time)
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark an int8 GEMM of shape (m x k) @ (k x n).

    Runs two measurements, in order: a bf16 `torch.mm` baseline on the
    operands cast to bfloat16, and the cutlass scaled-mm kernel on the int8
    operands. Returns them as a list.
    """
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    # pytorch impl
    pytorch_timer = bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
                             b.to(dtype=torch.bfloat16, device="cuda"),
                             scale_a, scale_b, torch.bfloat16, label,
                             sub_label, pytorch_i8_impl,
                             "pytorch_bf16_bf16_bf16_matmul-no-scales")

    # cutlass impl (scales are passed as CPU tensors)
    cutlass_timer = bench_fn(a, b, scale_a.to(device="cpu"),
                             scale_b.to(device="cpu"), torch.bfloat16, label,
                             sub_label, cutlass_impl,
                             "cutlass_i8_i8_bf16_scaled_mm")

    return [pytorch_timer, cutlass_timer]
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
              sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark an fp8 GEMM of shape (m x k) @ (k x n).

    Runs six measurements, in order: `torch._scaled_mm` with bf16 and fp16
    outputs, each without and with fast accumulation, then the cutlass
    kernel with bf16 and fp16 outputs. Returns them as a list.
    """
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    # The list elements are evaluated in order, so the benchmarks run in the
    # sequence written here.
    return [
        # pytorch impl: bf16 output, without fp8 fast accum
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"),
        # pytorch impl: bf16 output, with fp8 fast accum
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
                 pytorch_fp8_impl_fast_accum,
                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"),
        # pytorch impl: fp16 output, without fp8 fast accum
        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"),
        # pytorch impl: fp16 output, with fp8 fast accum
        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
                 pytorch_fp8_impl_fast_accum,
                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"),
        # cutlass impl: bf16 output
        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
                 torch.bfloat16, label, sub_label, cutlass_impl,
                 "cutlass_fp8_fp8_bf16_scaled_mm"),
        # cutlass impl: fp16 output
        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
                 torch.float16, label, sub_label, cutlass_impl,
                 "cutlass_fp8_fp8_fp16_scaled_mm"),
    ]
def
bench
(
dtype
:
torch
.
dtype
,
m
:
int
,
k
:
int
,
n
:
int
,
label
:
str
,
sub_label
:
str
)
->
Iterable
[
TMeasurement
]:
if
dtype
==
torch
.
int8
:
return
bench_int8
(
dtype
,
m
,
k
,
n
,
label
,
sub_label
)
if
dtype
==
torch
.
float8_e4m3fn
:
return
bench_fp8
(
dtype
,
m
,
k
,
n
,
label
,
sub_label
)
raise
ValueError
(
"unsupported type"
)
# runner
def print_timers(timers: Iterable[TMeasurement]):
    """Print a torch.utils.benchmark comparison table for `timers`."""
    TBenchmark.Compare(timers).print()
def run(dtype: torch.dtype,
        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
    """Benchmark every (m, k, n) shape in `MKNs` for `dtype`.

    The measurements of each shape are printed as soon as that shape
    finishes. Returns all measurements as one flat list.
    """
    collected = []
    for m, k, n in MKNs:
        label = f"scaled-{dtype}-gemm"
        sub_label = f"MKN=({m}x{k}x{n})"
        shape_timers = bench(dtype, m, k, n, label, sub_label)
        print_timers(shape_timers)
        collected += shape_timers
    return collected
# output makers
def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[Tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    """Print all measurements and pickle them to the working directory.

    The output file is named `<base_description>-<timestamp>.pkl`. When
    `timestamp` is None, the current time in whole seconds is used.
    `MKNs` is accepted but not used.
    """
    print(f"== All Results {base_description} ====")
    print_timers(data)

    # pickle all the results
    if timestamp is None:
        timestamp = int(time.time())
    out_path = f"{base_description}-{timestamp}.pkl"
    with open(out_path, "wb") as out_file:
        pkl.dump(data, out_file)
# argparse runners
def run_square_bench(args):
    """Benchmark square GEMMs (M == K == N) over the requested range.

    Dimensions run from `args.dim_start` to `args.dim_end` inclusive, in
    steps of `args.dim_increment`.
    """
    dims = range(args.dim_start, args.dim_end + 1, args.dim_increment)
    MKNs = [(dim, dim, dim) for dim in dims]
    measurements = run(args.dtype, MKNs)

    make_output(measurements, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
    """Sweep GEMM shapes, optionally holding M, K and/or N constant.

    Dimensions run from `args.dim_start` to `args.dim_end` inclusive, in
    steps of `args.dim_increment`. Each of M, K and N either follows the
    sweep or stays at `args.m_constant` / `args.k_constant` /
    `args.n_constant` when that value is not None.
    """
    # The end bound is inclusive, matching run_square_bench; the sweep
    # previously stopped one step short of --dim-end.
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    n = len(dim_sizes)

    def axis_values(constant):
        # A fixed axis repeats its constant once per sweep step.
        return dim_sizes if constant is None else [constant] * n

    Ms = axis_values(args.m_constant)
    Ks = axis_values(args.k_constant)
    Ns = axis_values(args.n_constant)
    MKNs = list(zip(Ms, Ks, Ns))
    data = run(args.dtype, MKNs)

    make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
    """Benchmark the GEMM shapes of each requested model and TP size.

    For every (model, tp_size) pair, every batch size in
    `args.batch_sizes` is combined with the model's tensor-parallel-adjusted
    (K, N) shapes. Results are printed per pair and pickled together into
    `model_bench-<dtype>-<timestamp>.pkl`.
    """
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
        # Work on a deep copy so WEIGHT_SHAPES itself is never modified.
        shapes = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] //= tp_size
            shapes.append(KN)
        return shapes

    models_tps = list(itertools.product(args.models, args.tp_sizes))
    model_bench_data = []
    for model, tp_size in models_tps:
        KNs = model_shapes(model, tp_size)
        MKNs = [(m, k, n) for m in args.batch_sizes for k, n in KNs]
        model_bench_data.append(run(args.dtype, MKNs))

    # Print all results
    for data, (model, tp_size) in zip(model_bench_data, models_tps):
        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
        print_timers(data)

    timestamp = int(time.time())

    all_data = [timer for data in model_bench_data for timer in data]
    # pickle all data
    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as out_file:
        pkl.dump(all_data, out_file)
if __name__ == '__main__':

    def to_torch_dtype(dt):
        """Map the --dtype string to its torch dtype (argparse `type` hook)."""
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError("unsupported dtype")

    parser = argparse.ArgumentParser(
        description="""
Benchmark Cutlass GEMM.
To run square GEMMs:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""",  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("--dtype",
                        type=to_torch_dtype,
                        required=True,
                        help="Available options are ['int8', 'fp8']")
    # A sub-command is mandatory: without one `args.func` is never set and
    # the dispatch at the bottom would fail with AttributeError.
    subparsers = parser.add_subparsers(dest="cmd", required=True)

    square_parser = subparsers.add_parser("square_bench")
    square_parser.add_argument("--dim-start", type=int, required=True)
    square_parser.add_argument("--dim-end", type=int, required=True)
    square_parser.add_argument("--dim-increment", type=int, required=True)
    square_parser.set_defaults(func=run_square_bench)

    range_parser = subparsers.add_parser("range_bench")
    range_parser.add_argument("--dim-start", type=int, required=True)
    range_parser.add_argument("--dim-end", type=int, required=True)
    range_parser.add_argument("--dim-increment", type=int, required=True)
    range_parser.add_argument("--m-constant", type=int, default=None)
    range_parser.add_argument("--n-constant", type=int, default=None)
    range_parser.add_argument("--k-constant", type=int, default=None)
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument("--models",
                              nargs="+",
                              type=str,
                              default=DEFAULT_MODELS,
                              choices=WEIGHT_SHAPES.keys())
    model_parser.add_argument("--tp-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_BATCH_SIZES)
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    # Each sub-parser registered its runner as `func`.
    args.func(args)
benchmarks/cutlass_benchmarks/weight_shapes.py
0 → 100644
View file @
f48954a4
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
# TP1 shapes
# Maps a model name to its GEMM weight shapes at TP1.
# Each entry is ([K, N], TP_SPLIT_DIM); TP_SPLIT_DIM is the index into [K, N]
# that is divided by the tensor-parallel size.
WEIGHT_SHAPES = {
    "mistralai/Mistral-7B-v0.1": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-7b-hf": [
        ([4096, 12288], 1),
        ([4096, 4096], 0),
        ([4096, 22016], 1),
        ([11008, 4096], 0),
    ],
    "meta-llama/Llama-2-13b-hf": [
        ([5120, 15360], 1),
        ([5120, 5120], 0),
        ([5120, 27648], 1),
        ([13824, 5120], 0),
    ],
    "meta-llama/Llama-2-70b-hf": [
        ([8192, 10240], 1),
        ([8192, 8192], 0),
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
}
benchmarks/kernels/benchmark_mixtral_moe.py
deleted
100644 → 0
View file @
1dba29d3
import
argparse
import
json
import
os
import
sys
import
torch
import
torch.nn.functional
as
F
import
triton
from
tqdm
import
tqdm
from
vllm.model_executor.layers.fused_moe
import
(
fused_moe
,
get_config_file_name
)
os
.
environ
[
'CUDA_VISIBLE_DEVICES'
]
=
'0'
def main(dtype: str):
    """Tune the fused_moe kernel for each batch size in a fixed list."""
    batch_sizes = (1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024,
                   1536, 2048, 3072, 4096)
    method = fused_moe

    for bs in batch_sizes:
        run_grid(bs, method=method, dtype=dtype)
def run_grid(bs, method, dtype: str):
    """Grid-search kernel configs for batch size `bs` and save the fastest.

    Every config is warmed up, then timed. The best one is stored under the
    key `str(bs)` in the JSON file named by `get_config_file_name`, keeping
    any entries already in that file.
    """
    # Model dimensions and run counts are fixed for this script.
    d_model = 4096
    num_total_experts = 8
    top_k = 2
    tp_size = 2
    model_intermediate_size = 14336
    num_layers = 32
    num_calls = 100

    num_warmup_trials = 1
    num_trials = 1

    # Same enumeration order as the equivalent nested loops.
    configs = [{
        "BLOCK_SIZE_M": block_size_m,
        "BLOCK_SIZE_N": block_size_n,
        "BLOCK_SIZE_K": block_size_k,
        "GROUP_SIZE_M": group_size_m,
        "num_warps": num_warps,
        "num_stages": num_stages,
    } for block_size_n in [32, 64, 128, 256]
               for block_size_m in [16, 32, 64, 128, 256]
               for block_size_k in [64, 128, 256]
               for group_size_m in [1, 16, 32, 64]
               for num_warps in [4, 8]
               for num_stages in [2, 3, 4, 5]]

    best_config = None
    best_time_us = 1e20

    print(f'{tp_size=} {bs=}')

    for config in tqdm(configs):
        # Arguments shared by the warmup and the timed call.
        timing_kwargs = dict(
            num_calls=num_calls,
            bs=bs,
            d_model=d_model,
            num_total_experts=num_total_experts,
            top_k=top_k,
            tp_size=tp_size,
            model_intermediate_size=model_intermediate_size,
            method=method,
            config=config,
            dtype=dtype,
        )

        # warmup
        try:
            for _ in range(num_warmup_trials):
                run_timing(**timing_kwargs)
        except triton.runtime.autotuner.OutOfResources:
            # Skip configs that exceed the device's resources.
            continue

        # trial
        for _ in range(num_trials):
            kernel_dur_ms = run_timing(**timing_kwargs)

            kernel_dur_us = 1000 * kernel_dur_ms
            model_dur_ms = kernel_dur_ms * num_layers

            if kernel_dur_us < best_time_us:
                best_config = config
                best_time_us = kernel_dur_us

                tqdm.write(
                    f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
                    f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
                    f'{d_model=} {model_intermediate_size=} {num_layers=}')

    print("best_time_us", best_time_us)
    print("best_config", best_config)

    # holds Dict[str, Dict[str, int]]
    filename = get_config_file_name(num_total_experts,
                                    model_intermediate_size // tp_size,
                                    "float8" if dtype == "float8" else None)
    print(f"writing config to file {filename}")
    existing_content = {}
    if os.path.exists(filename):
        with open(filename, "r") as config_file:
            existing_content = json.load(config_file)
    existing_content[str(bs)] = best_config
    with open(filename, "w") as config_file:
        json.dump(existing_content, config_file, indent=4)
        config_file.write("\n")
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
               top_k: int, tp_size: int, model_intermediate_size: int, method,
               config, dtype: str) -> float:
    """Return the average time in milliseconds of one `method` call.

    Random fp16 inputs are created on "cuda:0" and `method` is called
    `num_calls` times between two CUDA events. When `dtype` is "float8" the
    weights are cast to float8_e4m3fn and all scales are set to ones.

    `top_k` is forwarded as the kernel's `topk`; it was previously ignored
    in favour of a hard-coded 2.
    """
    shard_intermediate_size = model_intermediate_size // tp_size

    hidden_states = torch.rand(
        (bs, d_model),
        device="cuda:0",
        dtype=torch.float16,
    )

    w1 = torch.rand(
        (num_total_experts, 2 * shard_intermediate_size, d_model),
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )

    w2 = torch.rand(
        (num_total_experts, d_model, shard_intermediate_size),
        device=hidden_states.device,
        dtype=hidden_states.dtype,
    )

    w1_scale = None
    w2_scale = None
    a1_scale = None
    a2_scale = None

    if dtype == "float8":
        w1 = w1.to(torch.float8_e4m3fn)
        w2 = w2.to(torch.float8_e4m3fn)
        w1_scale = torch.ones(num_total_experts,
                              device=hidden_states.device,
                              dtype=torch.float32)
        w2_scale = torch.ones(num_total_experts,
                              device=hidden_states.device,
                              dtype=torch.float32)
        a1_scale = torch.ones(1,
                              device=hidden_states.device,
                              dtype=torch.float32)
        a2_scale = torch.ones(1,
                              device=hidden_states.device,
                              dtype=torch.float32)

    # One gating tensor per call, so every call gets different routing input.
    gating_output = F.softmax(torch.rand(
        (num_calls, bs, num_total_experts),
        device=hidden_states.device,
        dtype=torch.float32,
    ),
                              dim=-1)

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()
    for i in range(num_calls):
        # Each call's output becomes the next call's input.
        hidden_states = method(
            hidden_states=hidden_states,
            w1=w1,
            w2=w2,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            gating_output=gating_output[i],
            topk=top_k,
            renormalize=True,
            inplace=True,
            override_config=config,
            use_fp8=dtype == "float8",
        )
    end_event.record()
    end_event.synchronize()

    dur_ms = start_event.elapsed_time(end_event) / num_calls
    return dur_ms
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='benchmark_mixtral_moe',
        description='Benchmark and tune the fused_moe kernel',
    )
    # 'auto' is the default, so it is also listed as a choice; otherwise an
    # explicit `--dtype auto` would be rejected even though omitting the flag
    # yields the same value.
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'float8', 'float16'],
        help='Data type used for fused_moe kernel computations',
    )
    args = parser.parse_args()
    sys.exit(main(args.dtype))
benchmarks/kernels/benchmark_moe.py
0 → 100644
View file @
f48954a4
import
argparse
import
time
from
datetime
import
datetime
from
typing
import
Any
,
Dict
,
List
,
Tuple
import
ray
import
torch
import
triton
from
ray.experimental.tqdm_ray
import
tqdm
from
transformers
import
AutoConfig
from
vllm.model_executor.layers.fused_moe.fused_moe
import
*
def benchmark_config(
    config: Dict[str, int],
    num_tokens: int,
    num_experts: int,
    shard_intermediate_size: int,
    hidden_size: int,
    topk: int,
    dtype: torch.dtype,
    use_fp8: bool,
    num_iters: int = 100,
) -> float:
    """Return the average latency in microseconds of one `fused_moe` call.

    The kernel is run with `config` as its override configuration. Ten calls
    are captured in a CUDA graph; the graph is replayed `num_iters` times
    with a different gating tensor each time, and the total time is divided
    by the number of calls.
    """
    # With fp8 the weights are created in fp16 and cast afterwards.
    weight_dtype = torch.float16 if use_fp8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    w1 = torch.randn(num_experts,
                     shard_intermediate_size,
                     hidden_size,
                     dtype=weight_dtype)
    w2 = torch.randn(num_experts,
                     hidden_size,
                     shard_intermediate_size // 2,
                     dtype=weight_dtype)
    gating_output = torch.randn(num_iters,
                                num_tokens,
                                num_experts,
                                dtype=torch.float32)

    w1_scale = w2_scale = a1_scale = a2_scale = None
    if use_fp8:
        w1_scale = torch.randn(num_experts, dtype=torch.float32)
        w2_scale = torch.randn(num_experts, dtype=torch.float32)
        a1_scale = torch.randn(1, dtype=torch.float32)
        a2_scale = torch.randn(1, dtype=torch.float32)

        w1 = w1.to(torch.float8_e4m3fn)
        w2 = w2.to(torch.float8_e4m3fn)

    # Buffer read by the captured graph; refreshed before every replay.
    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)

    def prepare(i: int):
        input_gating.copy_(gating_output[i])

    def run():
        fused_moe(
            x,
            w1,
            w2,
            input_gating,
            topk,
            renormalize=True,
            inplace=True,
            override_config=config,
            use_fp8=use_fp8,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
        )

    # JIT compilation & warmup
    run()
    torch.cuda.synchronize()

    # Capture 10 invocations with CUDA graph
    calls_per_replay = 10
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(calls_per_replay):
            run()
    torch.cuda.synchronize()

    # Warmup
    for _ in range(5):
        graph.replay()
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    total_ms = 0
    for i in range(num_iters):
        prepare(i)
        torch.cuda.synchronize()

        start_event.record()
        graph.replay()
        end_event.record()
        end_event.synchronize()
        total_ms = total_ms + start_event.elapsed_time(end_event)
    avg = total_ms / (num_iters * calls_per_replay) * 1000  # us
    graph.reset()
    return avg
def get_configs_compute_bound() -> List[Dict[str, int]]:
    """Return the kernel configurations searched during tuning.

    The list is the cartesian product of the candidate values, with
    `num_stages` varying slowest and `GROUP_SIZE_M` varying fastest.
    """
    # Reduced search space for faster tuning.
    # TODO(woosuk): Increase the search space and use a performance model to
    # prune the search space.
    return [{
        "BLOCK_SIZE_M": block_m,
        "BLOCK_SIZE_N": block_n,
        "BLOCK_SIZE_K": block_k,
        "GROUP_SIZE_M": group_size,
        "num_warps": num_warps,
        "num_stages": num_stages,
    } for num_stages in (2, 3, 4, 5) for block_m in (16, 32, 64, 128, 256)
            for block_k in (64, 128, 256) for block_n in (32, 64, 128, 256)
            for num_warps in (4, 8) for group_size in (1, 16, 32, 64)]
@ray.remote(num_gpus=1)
class BenchmarkWorker:
    """Ray actor, requesting one GPU, that benchmarks or tunes `fused_moe`."""

    def __init__(self, seed: int) -> None:
        torch.set_default_device("cuda")
        torch.cuda.manual_seed_all(seed)
        self.seed = seed

    def benchmark(
        self,
        num_tokens: int,
        num_experts: int,
        shard_intermediate_size: int,
        hidden_size: int,
        topk: int,
        dtype: torch.dtype,
        use_fp8: bool,
    ) -> Tuple[Dict[str, int], float]:
        """Time the kernel with its stored or default configuration.

        Returns the configuration used and the value returned by
        `benchmark_config` for it.
        """
        torch.cuda.manual_seed_all(self.seed)

        dtype_str = "float8" if use_fp8 else None
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
                                    dtype_str)
        if op_config is None:
            config = get_default_config(num_tokens, num_experts,
                                        shard_intermediate_size, hidden_size,
                                        topk, dtype_str)
        else:
            # Use the stored config whose batch size is nearest num_tokens.
            config = op_config[min(op_config.keys(),
                                   key=lambda x: abs(x - num_tokens))]
        kernel_time = benchmark_config(config, num_tokens, num_experts,
                                       shard_intermediate_size, hidden_size,
                                       topk, dtype, use_fp8)
        return config, kernel_time

    def tune(
        self,
        num_tokens: int,
        num_experts: int,
        shard_intermediate_size: int,
        hidden_size: int,
        topk: int,
        dtype: torch.dtype,
        use_fp8: bool,
        search_space: List[Dict[str, int]],
    ) -> Dict[str, int]:
        """Return the fastest configuration in `search_space`.

        Raises:
            RuntimeError: if no configuration could be benchmarked.
        """
        best_config = None
        best_time = float("inf")
        for config in tqdm(search_space):
            try:
                kernel_time = benchmark_config(config,
                                               num_tokens,
                                               num_experts,
                                               shard_intermediate_size,
                                               hidden_size,
                                               topk,
                                               dtype,
                                               use_fp8,
                                               num_iters=10)
            except triton.runtime.autotuner.OutOfResources:
                # Some configurations may be invalid and fail to compile.
                continue

            if kernel_time < best_time:
                best_time = kernel_time
                best_config = config
        if best_config is None:
            # Fail here with a clear message instead of returning None and
            # crashing later when the result is post-processed.
            raise RuntimeError(
                f"No valid configuration found for batch_size={num_tokens}")
        now = datetime.now()
        print(f"[{now.ctime()}] Completed tuning for batch_size={num_tokens}")
        return best_config
def sort_config(config: Dict[str, int]) -> Dict[str, int]:
    """Return a new dict holding the six tuning keys of `config` in canonical order.

    Any other keys in `config` are dropped; a missing tuning key raises
    KeyError.
    """
    ordered_keys = (
        "BLOCK_SIZE_M",
        "BLOCK_SIZE_N",
        "BLOCK_SIZE_K",
        "GROUP_SIZE_M",
        "num_warps",
        "num_stages",
    )
    return {key: config[key] for key in ordered_keys}
def save_configs(
    configs: Dict[int, Dict[str, int]],
    num_experts: int,
    shard_intermediate_size: int,
    hidden_size: int,
    topk: int,
    dtype: torch.dtype,
    use_fp8: bool,
) -> None:
    """Write `configs` as indented JSON to the kernel's config file.

    The file name comes from `get_config_file_name`. `hidden_size`, `topk`
    and `dtype` are accepted but not used.
    """
    # Imported here so the function does not depend on `json` being
    # re-exported by the wildcard import from fused_moe.
    import json

    dtype_str = "float8" if use_fp8 else None
    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
    # is the intermediate size after silu_and_mul.
    filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
                                    dtype_str)
    print(f"Writing best config to {filename}...")
    with open(filename, "w") as f:
        json.dump(configs, f, indent=4)
        f.write("\n")
def main(args: argparse.Namespace):
    """Benchmark or tune the fused MoE kernel for the model in `args.model`.

    One `BenchmarkWorker` is started per GPU reported by Ray and batch sizes
    are assigned to workers round-robin. With `args.tune` the best config per
    batch size is searched and saved; otherwise the current config is timed
    and printed.
    """
    print(args)

    config = AutoConfig.from_pretrained(args.model)
    if config.architectures[0] == "DbrxForCausalLM":
        ffn_config = config.ffn_config
        E = ffn_config.moe_num_experts
        topk = ffn_config.moe_top_k
        intermediate_size = ffn_config.ffn_hidden_size
    else:
        # Default: Mixtral.
        E = config.num_local_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
    shard_intermediate_size = 2 * intermediate_size // args.tp_size

    hidden_size = config.hidden_size
    dtype = config.torch_dtype
    use_fp8 = args.dtype == "fp8"

    if args.batch_size is not None:
        batch_sizes = [args.batch_size]
    else:
        batch_sizes = [
            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
            2048, 3072, 4096
        ]

    ray.init()
    num_gpus = int(ray.available_resources()["GPU"])
    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]

    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
        # Submit every call first, round-robin, then wait for all results.
        pending = []
        for idx, input_args in enumerate(inputs):
            worker = workers[idx % num_gpus]
            pending.append(getattr(worker, method).remote(*input_args))
        return ray.get(pending)

    # Arguments shared by every call, after the batch size.
    common_args = (E, shard_intermediate_size, hidden_size, topk, dtype,
                   use_fp8)

    if not args.tune:
        outputs = _distribute("benchmark",
                              [(batch_size, ) + common_args
                               for batch_size in batch_sizes])

        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
            print(f"Batch size: {batch_size}, config: {config}")
            print(f"Kernel time: {kernel_time:.2f} us")
        return

    search_space = get_configs_compute_bound()
    print(f"Start tuning over {len(search_space)} configurations...")

    start = time.time()
    configs = _distribute("tune",
                          [(batch_size, ) + common_args + (search_space, )
                           for batch_size in batch_sizes])
    best_configs = {
        M: sort_config(config)
        for M, config in zip(batch_sizes, configs)
    }
    save_configs(best_configs, E, shard_intermediate_size, hidden_size, topk,
                 dtype, use_fp8)
    end = time.time()
    print(f"Tuning took {end - start:.2f} seconds")
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("--model",
                     type=str,
                     default="mistralai/Mixtral-8x7B-Instruct-v0.1")
    cli.add_argument("--tp-size", "-tp", type=int, default=2)
    cli.add_argument("--dtype",
                     type=str,
                     choices=["auto", "fp8"],
                     default="auto")
    cli.add_argument("--seed", type=int, default=0)
    # When omitted, main() sweeps its built-in list of batch sizes.
    cli.add_argument("--batch-size", type=int, required=False)
    cli.add_argument("--tune", action="store_true")

    main(cli.parse_args())
Prev
1
2
3
4
5
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment