Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
081057de
Commit
081057de
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-ori
parents
7cf5d5c4
ba41cc90
Changes
689
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
353 additions
and
35 deletions
+353
-35
.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
...configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
+1
-0
.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
...dkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+1
-0
.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
...l-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+1
-0
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+1
-1
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+15
-0
.buildkite/scripts/hardware_ci/run-amd-test.sh
.buildkite/scripts/hardware_ci/run-amd-test.sh
+7
-0
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+33
-2
.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
+13
-0
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+9
-2
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+59
-7
.github/CODEOWNERS
.github/CODEOWNERS
+1
-0
.github/ISSUE_TEMPLATE/200-installation.yml
.github/ISSUE_TEMPLATE/200-installation.yml
+1
-1
.github/ISSUE_TEMPLATE/300-usage.yml
.github/ISSUE_TEMPLATE/300-usage.yml
+1
-1
.github/ISSUE_TEMPLATE/400-bug-report.yml
.github/ISSUE_TEMPLATE/400-bug-report.yml
+1
-1
.github/ISSUE_TEMPLATE/700-performance-discussion.yml
.github/ISSUE_TEMPLATE/700-performance-discussion.yml
+1
-1
.github/mergify.yml
.github/mergify.yml
+32
-2
.gitignore
.gitignore
+3
-1
.pre-commit-config.yaml
.pre-commit-config.yaml
+0
-1
CMakeLists.txt
CMakeLists.txt
+66
-15
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+107
-0
No files found.
.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
View file @
081057de
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name
:
"
nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
model_name
:
"
nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks
:
tasks
:
...
...
.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
View file @
081057de
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name
:
"
Qwen/Qwen2-57B-A14B-Instruct"
model_name
:
"
Qwen/Qwen2-57B-A14B-Instruct"
tasks
:
tasks
:
...
...
.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
View file @
081057de
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name
:
"
nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
model_name
:
"
nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks
:
tasks
:
...
...
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
081057de
...
@@ -16,7 +16,7 @@ import numpy
...
@@ -16,7 +16,7 @@ import numpy
import
pytest
import
pytest
import
yaml
import
yaml
RTOL
=
0.0
5
RTOL
=
0.0
8
TEST_DATA_FILE
=
os
.
environ
.
get
(
TEST_DATA_FILE
=
os
.
environ
.
get
(
"LM_EVAL_TEST_DATA_FILE"
,
"LM_EVAL_TEST_DATA_FILE"
,
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml"
)
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml"
)
...
...
.buildkite/release-pipeline.yaml
View file @
081057de
...
@@ -86,3 +86,18 @@ steps:
...
@@ -86,3 +86,18 @@ steps:
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
-
block
:
"
Build
Neuron
release
image"
key
:
block-neuron-release-image-build
depends_on
:
~
-
label
:
"
Build
and
publish
Neuron
release
image"
depends_on
:
block-neuron-release-image-build
agents
:
queue
:
neuron-postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest
--progress
plain
-f
docker/Dockerfile.neuron
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent
meta-data
get
release-version)"
env
:
DOCKER_BUILDKIT
:
"
1"
.buildkite/scripts/hardware_ci/run-amd-test.sh
View file @
081057de
...
@@ -98,6 +98,13 @@ if [[ $commands == *" kernels "* ]]; then
...
@@ -98,6 +98,13 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_machete_mm.py
\
--ignore=kernels/test_machete_mm.py
\
--ignore=kernels/test_mha_attn.py
\
--ignore=kernels/test_mha_attn.py
\
--ignore=kernels/test_block_fp8.py
\
--ignore=kernels/test_block_fp8.py
\
--ignore=kernels/test_cutlass_moe.py
\
--ignore=kernels/test_mamba_ssm_ssd.py
\
--ignore=kernels/test_attention.py
\
--ignore=kernels/test_block_int8.py
\
--ignore=kernels/test_fused_quant_layernorm.py
\
--ignore=kernels/test_int8_kernel.py
\
--ignore=kernels/test_triton_moe_ptpc_fp8.py
\
--ignore=kernels/test_permute_cols.py"
--ignore=kernels/test_permute_cols.py"
fi
fi
...
...
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
View file @
081057de
...
@@ -5,10 +5,41 @@
...
@@ -5,10 +5,41 @@
set
-ex
set
-ex
# Setup cleanup
# Setup cleanup
remove_docker_container
()
{
docker
rm
-f
cpu-test
||
true
;
docker system prune
-f
;
}
remove_docker_container
()
{
if
[[
-n
"
$container_id
"
]]
;
then
podman
rm
-f
"
$container_id
"
||
true
fi
podman system prune
-f
}
trap
remove_docker_container EXIT
trap
remove_docker_container EXIT
remove_docker_container
remove_docker_container
# Try building the docker image
# Try building the docker image
docker build
-t
cpu-test
-f
docker/Dockerfile.ppc64le
.
podman build
-t
cpu-test-ubi9-ppc
-f
docker/Dockerfile.ppc64le
.
# Run the image
container_id
=
$(
podman run
-itd
--entrypoint
/bin/bash
-v
/tmp/:/root/.cache/huggingface
--privileged
=
true
--network
host
-e
HF_TOKEN cpu-test-ubi9-ppc
)
function
cpu_tests
()
{
# offline inference
podman
exec
-it
"
$container_id
"
bash
-c
"
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
podman
exec
-it
"
$container_id
"
bash
-c
"
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.
export
container_id
export
-f
cpu_tests
timeout
40m bash
-c
cpu_tests
.buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh
0 → 100755
View file @
081057de
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set
-ex
# Setup cleanup
remove_docker_container
()
{
docker
rm
-f
cpu-test
||
true
;
docker system prune
-f
;
}
trap
remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build
-t
cpu-test
-f
docker/Dockerfile.s390x
.
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
View file @
081057de
...
@@ -17,10 +17,13 @@ source /etc/environment
...
@@ -17,10 +17,13 @@ source /etc/environment
docker run
--privileged
--net
host
--shm-size
=
16G
-it
\
docker run
--privileged
--net
host
--shm-size
=
16G
-it
\
-e
"HF_TOKEN=
$HF_TOKEN
"
--name
tpu-test
\
-e
"HF_TOKEN=
$HF_TOKEN
"
--name
tpu-test
\
vllm-tpu /bin/bash
-c
"python3 -m pip install git+https://github.com/thuml/depyf.git
\
vllm-tpu /bin/bash
-c
"python3 -m pip install git+https://github.com/thuml/depyf.git
\
&& python3 -m pip install pytest
\
&& python3 -m pip install pytest
pytest-asyncio tpu-info
\
&& python3 -m pip install lm_eval[api]==0.4.4
\
&& python3 -m pip install lm_eval[api]==0.4.4
\
&& export VLLM_XLA_CACHE_PATH=
\
&& export VLLM_USE_V1=1
\
&& export VLLM_USE_V1=1
\
&& export VLLM_XLA_CHECK_RECOMPILATION=1
\
&& export VLLM_XLA_CHECK_RECOMPILATION=1
\
&& echo HARDWARE
\
&& tpu-info
\
&& echo TEST_0
\
&& echo TEST_0
\
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py
\
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py
\
&& echo TEST_1
\
&& echo TEST_1
\
...
@@ -40,7 +43,11 @@ docker run --privileged --net host --shm-size=16G -it \
...
@@ -40,7 +43,11 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_8
\
&& echo TEST_8
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py
\
&& echo TEST_9
\
&& echo TEST_9
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py
\
&& echo TEST_10
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py
\
&& echo TEST_11
\
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
\
# TODO: This test fails because it uses RANDOM_SEED sampling
# TODO: This test fails because it uses RANDOM_SEED sampling
...
...
.buildkite/test-pipeline.yaml
View file @
081057de
...
@@ -8,6 +8,7 @@
...
@@ -8,6 +8,7 @@
# Documentation
# Documentation
# label(str): the name of the test. emoji allowed.
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
# command(str): the single command to run for tests. incompatible with commands.
...
@@ -70,6 +71,7 @@ steps:
...
@@ -70,6 +71,7 @@ steps:
-
label
:
Basic Correctness Test
# 30min
-
label
:
Basic Correctness Test
# 30min
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
fast_check
:
true
fast_check
:
true
torch_nightly
:
true
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/basic_correctness/test_basic_correctness
-
tests/basic_correctness/test_basic_correctness
...
@@ -104,6 +106,7 @@ steps:
...
@@ -104,6 +106,7 @@ steps:
-
label
:
Entrypoints Test
# 40min
-
label
:
Entrypoints Test
# 40min
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
fast_check
:
true
torch_nightly
:
true
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
...
@@ -118,7 +121,7 @@ steps:
...
@@ -118,7 +121,7 @@ steps:
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate_multiple_loras.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate_multiple_loras.py
# it needs a clean process
-
VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py
# it needs a clean process
-
VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
--ignore=entrypoints/openai/test_openai_schema.py
-
pytest -v -s entrypoints/test_chat_utils.py
-
pytest -v -s entrypoints/test_chat_utils.py
-
VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
...
@@ -205,6 +208,8 @@ steps:
...
@@ -205,6 +208,8 @@ steps:
-
pytest -v -s v1/sample
-
pytest -v -s v1/sample
-
pytest -v -s v1/worker
-
pytest -v -s v1/worker
-
pytest -v -s v1/structured_output
-
pytest -v -s v1/structured_output
-
pytest -v -s v1/spec_decode
-
pytest -v -s v1/test_serial_utils.py
-
pytest -v -s v1/test_stats.py
-
pytest -v -s v1/test_stats.py
-
pytest -v -s v1/test_utils.py
-
pytest -v -s v1/test_utils.py
-
pytest -v -s v1/test_oracle.py
-
pytest -v -s v1/test_oracle.py
...
@@ -294,6 +299,7 @@ steps:
...
@@ -294,6 +299,7 @@ steps:
commands
:
commands
:
-
pytest -v -s compile/test_pass_manager.py
-
pytest -v -s compile/test_pass_manager.py
-
pytest -v -s compile/test_fusion.py
-
pytest -v -s compile/test_fusion.py
-
pytest -v -s compile/test_sequence_parallelism.py
-
label
:
PyTorch Fullgraph Smoke Test
# 9min
-
label
:
PyTorch Fullgraph Smoke Test
# 9min
source_file_dependencies
:
source_file_dependencies
:
...
@@ -312,15 +318,46 @@ steps:
...
@@ -312,15 +318,46 @@ steps:
commands
:
commands
:
-
pytest -v -s compile/test_full_graph.py
-
pytest -v -s compile/test_full_graph.py
-
label
:
Kernels Test %N
# 1h each
-
label
:
Kernels Core Operation Test
# mirror_hardwares: [amd]
source_file_dependencies
:
source_file_dependencies
:
-
csrc/
-
csrc/
-
tests/kernels/core
commands
:
-
pytest -v -s kernels/core
-
label
:
Kernels Attention Test %N
source_file_dependencies
:
-
csrc/attention/
-
vllm/attention
-
vllm/attention
-
tests/kernels
-
vllm/v1/attention
-
tests/kernels/attention
commands
:
commands
:
-
pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-
pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism
:
4
parallelism
:
2
-
label
:
Kernels Quantization Test %N
source_file_dependencies
:
-
csrc/quantization/
-
vllm/model_executor/layers/quantization
-
tests/kernels/quantization
commands
:
-
pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism
:
2
-
label
:
Kernels MoE Test
source_file_dependencies
:
-
csrc/moe/
-
tests/kernels/moe
-
vllm/model_executor/layers/fused_moe/
commands
:
-
pytest -v -s kernels/moe
-
label
:
Kernels Mamba Test
source_file_dependencies
:
-
csrc/mamba/
-
tests/kernels/mamba
commands
:
-
pytest -v -s kernels/mamba
-
label
:
Tensorizer Test
# 11min
-
label
:
Tensorizer Test
# 11min
# mirror_hardwares: [amd]
# mirror_hardwares: [amd]
...
@@ -341,6 +378,13 @@ steps:
...
@@ -341,6 +378,13 @@ steps:
commands
:
commands
:
-
bash scripts/run-benchmarks.sh
-
bash scripts/run-benchmarks.sh
-
label
:
Benchmarks CLI Test
# 10min
source_file_dependencies
:
-
vllm/
-
tests/benchmarks/
commands
:
-
pytest -v -s benchmarks/
-
label
:
Quantization Test
# 33min
-
label
:
Quantization Test
# 33min
source_file_dependencies
:
source_file_dependencies
:
-
csrc/
-
csrc/
...
@@ -393,8 +437,9 @@ steps:
...
@@ -393,8 +437,9 @@ steps:
-
pytest -v -s models/test_transformers.py
-
pytest -v -s models/test_transformers.py
-
pytest -v -s models/test_registry.py
-
pytest -v -s models/test_registry.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
-
VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
-
VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4
and not plamo2
'
-
VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
-
VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
-
VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
-
label
:
Language Models Test (Standard)
# 32min
-
label
:
Language Models Test (Standard)
# 32min
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
...
@@ -404,6 +449,8 @@ steps:
...
@@ -404,6 +449,8 @@ steps:
-
tests/models/embedding/language
-
tests/models/embedding/language
-
tests/models/encoder_decoder/language
-
tests/models/encoder_decoder/language
commands
:
commands
:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-
pip install causal-conv1d
-
pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-
pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-
pytest -v -s models/embedding/language -m core_model
-
pytest -v -s models/embedding/language -m core_model
...
@@ -415,6 +462,8 @@ steps:
...
@@ -415,6 +462,8 @@ steps:
-
tests/models/embedding/language
-
tests/models/embedding/language
-
tests/models/encoder_decoder/language
-
tests/models/encoder_decoder/language
commands
:
commands
:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-
pip install causal-conv1d
-
pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-
pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-
pytest -v -s models/embedding/language -m 'not core_model'
-
pytest -v -s models/embedding/language -m 'not core_model'
...
@@ -535,11 +584,14 @@ steps:
...
@@ -535,11 +584,14 @@ steps:
-
pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-
pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-
pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-
pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-
pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-
pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
-
pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# this test fails consistently.
# TODO: investigate and fix
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
-
VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-
label
:
Plugin Tests (2 GPUs)
# 40min
-
label
:
Plugin Tests (2 GPUs)
# 40min
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
...
...
.github/CODEOWNERS
View file @
081057de
...
@@ -12,6 +12,7 @@
...
@@ -12,6 +12,7 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
CMakeLists.txt @tlrmchlsmth
CMakeLists.txt @tlrmchlsmth
# vLLM V1
# vLLM V1
...
...
.github/ISSUE_TEMPLATE/200-installation.yml
View file @
081057de
...
@@ -14,7 +14,7 @@ body:
...
@@ -14,7 +14,7 @@ body:
description
:
|
description
:
|
Please run the following and paste the output below.
Please run the following and paste the output below.
```sh
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/
vllm/
collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
python collect_env.py
```
```
...
...
.github/ISSUE_TEMPLATE/300-usage.yml
View file @
081057de
...
@@ -14,7 +14,7 @@ body:
...
@@ -14,7 +14,7 @@ body:
description
:
|
description
:
|
Please run the following and paste the output below.
Please run the following and paste the output below.
```sh
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/
vllm/
collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
python collect_env.py
```
```
...
...
.github/ISSUE_TEMPLATE/400-bug-report.yml
View file @
081057de
...
@@ -14,7 +14,7 @@ body:
...
@@ -14,7 +14,7 @@ body:
description
:
|
description
:
|
Please run the following and paste the output below.
Please run the following and paste the output below.
```sh
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/
vllm/
collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
python collect_env.py
```
```
...
...
.github/ISSUE_TEMPLATE/700-performance-discussion.yml
View file @
081057de
...
@@ -35,7 +35,7 @@ body:
...
@@ -35,7 +35,7 @@ body:
description
:
|
description
:
|
Please run the following and paste the output below.
Please run the following and paste the output below.
```sh
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/
vllm/
collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
python collect_env.py
```
```
...
...
.github/mergify.yml
View file @
081057de
...
@@ -55,11 +55,19 @@ pull_request_rules:
...
@@ -55,11 +55,19 @@ pull_request_rules:
description
:
Automatically apply structured-output label
description
:
Automatically apply structured-output label
conditions
:
conditions
:
-
or
:
-
or
:
-
files~=^benchmarks/structured_schemas/
-
files=benchmarks/benchmark_serving_structured_output.py
-
files=benchmarks/run_structured_output_benchmark.sh
-
files=docs/source/features/structured_outputs.md
-
files=examples/offline_inference/structured_outputs.py
-
files=examples/online_serving/openai_chat_completion_structured_outputs.py
-
files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
-
files~=^vllm/model_executor/guided_decoding/
-
files~=^vllm/model_executor/guided_decoding/
-
files=tests/model_executor/test_guided_processors.py
-
files=tests/model_executor/test_guided_processors.py
-
files=tests/entrypoints/llm/test_guided_generate.py
-
files=tests/entrypoints/llm/test_guided_generate.py
-
files=benchmarks/benchmark_serving_guided.py
-
files~=^tests/v1/structured_output/
-
files=benchmarks/benchmark_guided.py
-
files=tests/v1/entrypoints/llm/test_guided_generate.py
-
files~=^vllm/v1/structured_output/
actions
:
actions
:
label
:
label
:
add
:
add
:
...
@@ -118,6 +126,28 @@ pull_request_rules:
...
@@ -118,6 +126,28 @@ pull_request_rules:
remove
:
remove
:
-
tpu
-
tpu
-
name
:
label-tool-calling
description
:
Automatically add tool-calling label
conditions
:
-
or
:
-
files~=^tests/tool_use/
-
files~=^tests/mistral_tool_use/
-
files~=^tests/entrypoints/openai/tool_parsers/
-
files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
-
files~=^vllm/entrypoints/openai/tool_parsers/
-
files=docs/source/features/tool_calling.md
-
files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
-
files=docs/source/getting_started/examples/chat_with_tools.md
-
files~=^examples/tool_chat_*
-
files=examples/offline_inference/chat_with_tools.py
-
files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
-
files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
-
files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions
:
label
:
add
:
-
tool-calling
-
name
:
ping author on conflicts and add 'needs-rebase' label
-
name
:
ping author on conflicts and add 'needs-rebase' label
conditions
:
conditions
:
-
conflict
-
conflict
...
...
.gitignore
View file @
081057de
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
# vllm-flash-attn built from source
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/fa_utils.py
# Byte-compiled / optimized / DLL files
# Byte-compiled / optimized / DLL files
__pycache__/
__pycache__/
...
@@ -203,3 +202,6 @@ benchmarks/**/*.json
...
@@ -203,3 +202,6 @@ benchmarks/**/*.json
# Linting
# Linting
actionlint
actionlint
shellcheck*/
shellcheck*/
# Ingore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
.pre-commit-config.yaml
View file @
081057de
...
@@ -11,7 +11,6 @@ repos:
...
@@ -11,7 +11,6 @@ repos:
hooks
:
hooks
:
-
id
:
yapf
-
id
:
yapf
args
:
[
--in-place
,
--verbose
]
args
:
[
--in-place
,
--verbose
]
additional_dependencies
:
[
toml
]
# TODO: Remove when yapf is upgraded
-
repo
:
https://github.com/astral-sh/ruff-pre-commit
-
repo
:
https://github.com/astral-sh/ruff-pre-commit
rev
:
v0.9.3
rev
:
v0.9.3
hooks
:
hooks
:
...
...
CMakeLists.txt
View file @
081057de
...
@@ -252,7 +252,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -252,7 +252,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Please keep this in sync with FetchContent_Declare line below.
# Please keep this in sync with FetchContent_Declare line below.
set
(
CUTLASS_REVISION
"v3.
8
.0"
CACHE STRING
"CUTLASS revision to use"
)
set
(
CUTLASS_REVISION
"v3.
9
.0"
CACHE STRING
"CUTLASS revision to use"
)
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if
(
DEFINED ENV{VLLM_CUTLASS_SRC_DIR}
)
if
(
DEFINED ENV{VLLM_CUTLASS_SRC_DIR}
)
...
@@ -270,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -270,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.
8
.0
GIT_TAG v3.
9
.0
GIT_PROGRESS TRUE
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
...
@@ -291,7 +291,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -291,7 +291,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
)
"csrc/cutlass_extensions/common.cpp"
"csrc/attention/mla/cutlass_mla_entry.cu"
)
set_gencode_flags_for_srcs
(
set_gencode_flags_for_srcs
(
SRCS
"
${
VLLM_EXT_SRC
}
"
SRCS
"
${
VLLM_EXT_SRC
}
"
...
@@ -464,7 +465,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -464,7 +465,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set
(
FP4_ARCHS
)
set
(
FP4_ARCHS
)
endif
()
endif
()
#
# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection
(
MLA_ARCHS
"10.0a"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8 AND MLA_ARCHS
)
set
(
SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
SRCS
}
"
CUDA_ARCHS
"
${
MLA_ARCHS
}
"
)
list
(
APPEND VLLM_EXT_SRC
"
${
SRCS
}
"
)
list
(
APPEND VLLM_GPU_FLAGS
"-DENABLE_CUTLASS_MLA=1"
)
# Add MLA-specific include directories only to MLA source files
set_source_files_properties
(
${
SRCS
}
PROPERTIES INCLUDE_DIRECTORIES
"
${
CUTLASS_DIR
}
/examples/77_blackwell_fmha;
${
CUTLASS_DIR
}
/examples/common"
)
message
(
STATUS
"Building CUTLASS MLA for archs:
${
MLA_ARCHS
}
"
)
else
()
message
(
STATUS
"Not building CUTLASS MLA as no compatible archs were found."
)
# clear MLA_ARCHS
set
(
MLA_ARCHS
)
endif
()
# CUTLASS MoE kernels
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
...
@@ -610,21 +630,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -610,21 +630,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND VLLM_MOE_EXT_SRC
"
${
VLLM_MOE_WNA16_SRC
}
"
)
list
(
APPEND VLLM_MOE_EXT_SRC
"
${
VLLM_MOE_WNA16_SRC
}
"
)
cuda_archs_loose_intersection
(
MARLIN_MOE_ARCHS
"8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
MARLIN_MOE_ARCHS
"8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0"
"
${
CUDA_ARCHS
}
"
)
if
(
MARLIN_MOE_ARCHS
)
if
(
MARLIN_MOE_ARCHS
)
set
(
MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu"
)
#
# For the Marlin MOE kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set
(
MOE_MARLIN_GEN_SCRIPT
${
CMAKE_CURRENT_SOURCE_DIR
}
/csrc/moe/marlin_moe_wna16/generate_kernels.py
)
file
(
MD5
${
MOE_MARLIN_GEN_SCRIPT
}
MOE_MARLIN_GEN_SCRIPT_HASH
)
message
(
STATUS
"Marlin MOE generation script hash:
${
MOE_MARLIN_GEN_SCRIPT_HASH
}
"
)
message
(
STATUS
"Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}"
)
if
(
NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL
${
MOE_MARLIN_GEN_SCRIPT_HASH
}
)
execute_process
(
COMMAND
${
CMAKE_COMMAND
}
-E env
PYTHONPATH=
${
CMAKE_CURRENT_SOURCE_DIR
}
/csrc/cutlass_extensions/:
${
CUTLASS_DIR
}
/python/:
${
VLLM_PYTHON_PATH
}
:$PYTHONPATH
${
Python_EXECUTABLE
}
${
MOE_MARLIN_GEN_SCRIPT
}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE
${
CMAKE_CURRENT_BINARY_DIR
}
/moe_marlin_generation.log
ERROR_FILE
${
CMAKE_CURRENT_BINARY_DIR
}
/moe_marlin_generation.log
)
if
(
NOT moe_marlin_generation_result EQUAL 0
)
message
(
FATAL_ERROR
"Marlin MOE generation failed."
" Result:
\"
${
moe_marlin_generation_result
}
\"
"
"
\n
Check the log for details: "
"
${
CMAKE_CURRENT_BINARY_DIR
}
/moe_marlin_generation.log"
)
else
()
set
(
MOE_MARLIN_GEN_SCRIPT_HASH
${
MOE_MARLIN_GEN_SCRIPT_HASH
}
CACHE STRING
"Last run Marlin MOE generate script hash"
FORCE
)
message
(
STATUS
"Marlin MOE generation completed successfully."
)
endif
()
else
()
message
(
STATUS
"Marlin MOE generation script has not changed, skipping generation."
)
endif
()
file
(
GLOB MOE_WNAA16_MARLIN_SRC
"csrc/moe/marlin_moe_wna16/*.cu"
)
set_gencode_flags_for_srcs
(
set_gencode_flags_for_srcs
(
SRCS
"
${
MARLIN_
MOE_
SRC
}
"
SRCS
"
${
MOE_WNAA16_
MARLIN_SRC
}
"
CUDA_ARCHS
"
${
MARLIN_MOE_ARCHS
}
"
)
CUDA_ARCHS
"
${
MARLIN_MOE_ARCHS
}
"
)
list
(
APPEND VLLM_MOE_EXT_SRC
"
${
MARLIN_MOE_SRC
}
"
)
list
(
APPEND VLLM_MOE_EXT_SRC
${
MOE_WNAA16_MARLIN_SRC
}
)
message
(
STATUS
"Building Marlin MOE kernels for archs:
${
MARLIN_MOE_ARCHS
}
"
)
message
(
STATUS
"Building Marlin MOE kernels for archs:
${
MARLIN_MOE_ARCHS
}
"
)
else
()
else
()
message
(
STATUS
"Not building Marlin MOE kernels as no compatible archs found"
message
(
STATUS
"Not building Marlin MOE kernels as no compatible archs found"
...
@@ -650,6 +700,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
...
@@ -650,6 +700,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
#
#
set(VLLM_ROCM_EXT_SRC
set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp"
"csrc/rocm/torch_bindings.cpp"
"csrc/rocm/skinny_gemms.cu"
"csrc/rocm/attention.cu")
"csrc/rocm/attention.cu")
define_gpu_extension_target(
define_gpu_extension_target(
...
...
benchmarks/backend_request_func.py
View file @
081057de
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
io
import
json
import
json
import
os
import
os
import
sys
import
sys
...
@@ -32,6 +33,7 @@ class RequestFuncInput:
...
@@ -32,6 +33,7 @@ class RequestFuncInput:
extra_body
:
Optional
[
dict
]
=
None
extra_body
:
Optional
[
dict
]
=
None
multi_modal_content
:
Optional
[
dict
]
=
None
multi_modal_content
:
Optional
[
dict
]
=
None
ignore_eos
:
bool
=
False
ignore_eos
:
bool
=
False
language
:
Optional
[
str
]
=
None
@
dataclass
@
dataclass
...
@@ -436,6 +438,110 @@ async def async_request_openai_chat_completions(
...
@@ -436,6 +438,110 @@ async def async_request_openai_chat_completions(
return
output
return
output
async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one streaming audio request and record timing metrics.

    The audio in ``request_func_input.multi_modal_content['audio']`` is
    unpacked as ``(samples, sample_rate)``, encoded to an in-memory WAV file
    and posted as multipart/form-data together with the flattened request
    parameters. The streamed response is consumed chunk by chunk to measure
    time-to-first-token and inter-token latencies.

    Args:
        request_func_input: Request description. ``api_url`` must end with
            ``transcriptions`` or ``translations``. ``extra_body`` entries,
            when present, override the default form fields. ``language``
            selects the request language and falls back to ``"en"`` when
            unset.
        pbar: Optional progress bar; advanced by one when the request is done.

    Returns:
        A ``RequestFuncOutput``. On HTTP 200 it carries the generated text,
        ttft, itl list, latency and (if reported) output token count. On any
        other status or on an exception ``success`` is False and ``error``
        holds the reason or formatted traceback.
    """
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile

    api_url = request_func_input.api_url
    assert api_url.endswith(("transcriptions", "translations")), (
        "OpenAI Audio API URL must end with 'transcriptions' "
        "or 'translations'.")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        model = (request_func_input.model_name
                 if request_func_input.model_name else
                 request_func_input.model)
        payload = {
            "model": model,
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            # Honour the per-request language; "en" stays the default.
            "language": request_func_input.language or "en",
            # Flattened due to multipart/form-data
            "stream_include_usage": True,
            "stream_continuous_usage_stats": True,
        }
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        # Send audio file
        def to_bytes(y, sr):
            """Encode samples ``y`` at rate ``sr`` as a rewound WAV buffer."""
            buffer = io.BytesIO()
            soundfile.write(buffer, y, sr, format="WAV")
            buffer.seek(0)
            return buffer

        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
            form = aiohttp.FormData()
            form.add_field('file', f, content_type='audio/wav')
            # Multipart fields are sent as strings, so stringify each value.
            for key, value in payload.items():
                form.add_field(key, str(value))

            output = RequestFuncOutput()
            output.prompt_len = request_func_input.prompt_len

            generated_text = ""
            ttft = 0.0
            st = time.perf_counter()
            most_recent_timestamp = st
            try:
                async with session.post(url=api_url,
                                        data=form,
                                        headers=headers) as response:
                    if response.status == 200:
                        async for chunk_bytes in response.content:
                            chunk_bytes = chunk_bytes.strip()
                            if not chunk_bytes:
                                continue

                            chunk = chunk_bytes.decode("utf-8").removeprefix(
                                "data: ")
                            if chunk == "[DONE]":
                                continue

                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                delta_text = choices[0]["delta"].get(
                                    "content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                generated_text += delta_text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")

                            most_recent_timestamp = timestamp

                        output.generated_text = generated_text
                        output.success = True
                        output.latency = most_recent_timestamp - st
                    else:
                        output.error = response.reason or ""
                        output.success = False
            except Exception:
                # Benchmark boundary: record the failure instead of raising
                # so that a single bad request does not abort the run.
                output.success = False
                exc_info = sys.exc_info()
                output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output
def
get_model
(
pretrained_model_name_or_path
:
str
)
->
str
:
def
get_model
(
pretrained_model_name_or_path
:
str
)
->
str
:
if
os
.
getenv
(
'VLLM_USE_MODELSCOPE'
,
'False'
).
lower
()
==
'true'
:
if
os
.
getenv
(
'VLLM_USE_MODELSCOPE'
,
'False'
).
lower
()
==
'true'
:
from
modelscope
import
snapshot_download
from
modelscope
import
snapshot_download
...
@@ -493,6 +599,7 @@ ASYNC_REQUEST_FUNCS = {
...
@@ -493,6 +599,7 @@ ASYNC_REQUEST_FUNCS = {
"deepspeed-mii"
:
async_request_deepspeed_mii
,
"deepspeed-mii"
:
async_request_deepspeed_mii
,
"openai"
:
async_request_openai_completions
,
"openai"
:
async_request_openai_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
"openai-audio"
:
async_request_openai_audio
,
"tensorrt-llm"
:
async_request_trt_llm
,
"tensorrt-llm"
:
async_request_trt_llm
,
"scalellm"
:
async_request_openai_completions
,
"scalellm"
:
async_request_openai_completions
,
"sglang"
:
async_request_openai_completions
,
"sglang"
:
async_request_openai_completions
,
...
...
Prev
1
2
3
4
5
6
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment