Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
415b817b
Commit
415b817b
authored
Sep 17, 2025
by
王敏
Browse files
merge 092-dev分支近期修改
parents
3c08fbc1
bc9aee38
Changes
66
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
293 additions
and
201 deletions
+293
-201
csrc/moe/moe_align_sum_kernels.cu
csrc/moe/moe_align_sum_kernels.cu
+90
-0
csrc/moe/moe_ops.h
csrc/moe/moe_ops.h
+1
-0
csrc/moe/torch_bindings.cpp
csrc/moe/torch_bindings.cpp
+2
-1
requirements/rocm.txt
requirements/rocm.txt
+1
-0
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+10
-11
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+2
-3
tests/core/test_num_computed_tokens_update.py
tests/core/test_num_computed_tokens_update.py
+1
-3
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+31
-30
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+2
-1
tests/detokenizer/test_disable_detokenization.py
tests/detokenizer/test_disable_detokenization.py
+2
-3
tests/entrypoints/test_api_server_process_manager.py
tests/entrypoints/test_api_server_process_manager.py
+57
-56
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+8
-8
tests/fastsafetensors_loader/untest_fastsafetensors_loader.py
...s/fastsafetensors_loader/untest_fastsafetensors_loader.py
+0
-0
tests/fastsafetensors_loader/untest_weight_utils.py
tests/fastsafetensors_loader/untest_weight_utils.py
+0
-0
tests/kernels/mamba/test_mamba_ssm_ssd.py
tests/kernels/mamba/test_mamba_ssm_ssd.py
+25
-25
tests/kernels/quantization/untest_block_int8.py
tests/kernels/quantization/untest_block_int8.py
+0
-0
tests/kernels/quantization/untest_triton_scaled_mm.py
tests/kernels/quantization/untest_triton_scaled_mm.py
+0
-0
tests/metrics/test_metrics.py
tests/metrics/test_metrics.py
+52
-51
tests/model_executor/test_enabled_custom_ops.py
tests/model_executor/test_enabled_custom_ops.py
+6
-6
tests/model_executor/test_weight_utils.py
tests/model_executor/test_weight_utils.py
+3
-3
No files found.
csrc/moe/moe_align_sum_kernels.cu
View file @
415b817b
...
...
@@ -173,6 +173,35 @@ __global__ void moe_sum_kernel(
}
}
template
<
typename
scalar_t
,
int
TOPK
,
int
SPLIT_D
,
int
BLOCK_DIM
>
__global__
void
moe_sum_sharedmem_topk8
(
scalar_t
*
__restrict__
out
,
const
scalar_t
*
__restrict__
input
,
const
int
d
)
{
const
int
token_idx
=
blockIdx
.
x
/
SPLIT_D
;
const
int
sub_block
=
blockIdx
.
x
%
SPLIT_D
;
const
int
d_per_block
=
(
d
+
SPLIT_D
-
1
)
/
SPLIT_D
;
const
int64_t
d_start
=
sub_block
*
d_per_block
;
const
int64_t
token_offset
=
token_idx
*
TOPK
*
d
;
const
int64_t
d_end
=
min
(
d_start
+
d_per_block
,
d
);
__shared__
__align__
(
16
)
scalar_t
sem_input
[
TOPK
][
BLOCK_DIM
];
for
(
int64_t
idx
=
d_start
+
threadIdx
.
x
;
idx
<
d_end
;
idx
+=
blockDim
.
x
)
{
sem_input
[
0
][
threadIdx
.
x
]
=
input
[
token_offset
+
0
*
d
+
idx
];
sem_input
[
1
][
threadIdx
.
x
]
=
input
[
token_offset
+
1
*
d
+
idx
];
sem_input
[
2
][
threadIdx
.
x
]
=
input
[
token_offset
+
2
*
d
+
idx
];
sem_input
[
3
][
threadIdx
.
x
]
=
input
[
token_offset
+
3
*
d
+
idx
];
sem_input
[
4
][
threadIdx
.
x
]
=
input
[
token_offset
+
4
*
d
+
idx
];
sem_input
[
5
][
threadIdx
.
x
]
=
input
[
token_offset
+
5
*
d
+
idx
];
sem_input
[
6
][
threadIdx
.
x
]
=
input
[
token_offset
+
6
*
d
+
idx
];
sem_input
[
7
][
threadIdx
.
x
]
=
input
[
token_offset
+
7
*
d
+
idx
];
__syncthreads
();
scalar_t
x
=
sem_input
[
0
][
threadIdx
.
x
]
+
sem_input
[
1
][
threadIdx
.
x
]
+
sem_input
[
2
][
threadIdx
.
x
]
+
sem_input
[
3
][
threadIdx
.
x
]
+
sem_input
[
4
][
threadIdx
.
x
]
+
sem_input
[
5
][
threadIdx
.
x
]
+
sem_input
[
6
][
threadIdx
.
x
]
+
sem_input
[
7
][
threadIdx
.
x
];
out
[
token_idx
*
d
+
idx
]
=
x
;
}
}
template
<
typename
scalar_t
>
__global__
void
moe_align_block_size_small_batch_expert_kernel
(
const
scalar_t
*
__restrict__
topk_ids
,
...
...
@@ -358,3 +387,64 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
break
;
}
}
void
moe_sum_opt1
(
torch
::
Tensor
&
input
,
// [num_tokens, topk, hidden_size]
torch
::
Tensor
&
output
)
// [num_tokens, hidden_size]
{
const
int
hidden_size
=
input
.
size
(
-
1
);
const
auto
num_tokens
=
output
.
numel
()
/
hidden_size
;
const
int
topk
=
input
.
size
(
1
);
dim3
grid
(
num_tokens
);
dim3
block
(
std
::
min
(
hidden_size
,
1024
));
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
output
));
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
constexpr
int
splitD_
=
8
;
const
int
TOPK8_GRID_DIM
=
num_tokens
*
splitD_
;
constexpr
int
TOPK8_BLOCK_DIM
=
256
;
dim3
grid_8
(
TOPK8_GRID_DIM
);
dim3
block_8
(
TOPK8_BLOCK_DIM
);
switch
(
topk
)
{
case
2
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_kernel"
,
[
&
]
{
vllm
::
moe
::
moe_sum_kernel
<
scalar_t
,
2
><<<
grid
,
block
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
case
3
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_kernel"
,
[
&
]
{
vllm
::
moe
::
moe_sum_kernel
<
scalar_t
,
3
><<<
grid
,
block
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
case
4
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_kernel"
,
[
&
]
{
vllm
::
moe
::
moe_sum_kernel
<
scalar_t
,
4
><<<
grid
,
block
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
case
8
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_sharedmem_topk8"
,
[
&
]{
vllm
::
moe
::
moe_sum_sharedmem_topk8
<
scalar_t
,
8
,
splitD_
,
TOPK8_BLOCK_DIM
><<<
grid_8
,
block_8
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
default:
at
::
sum_out
(
output
,
input
,
1
);
break
;
}
}
\ No newline at end of file
csrc/moe/moe_ops.h
View file @
415b817b
...
...
@@ -7,6 +7,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch
::
Tensor
&
gating_output
);
void
moe_sum
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
output
);
void
moe_sum_opt1
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
output
);
void
moe_align_block_size
(
torch
::
Tensor
topk_ids
,
int64_t
num_experts
,
int64_t
block_size
,
torch
::
Tensor
sorted_token_ids
,
...
...
csrc/moe/torch_bindings.cpp
View file @
415b817b
...
...
@@ -11,8 +11,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Calculate the result of moe by summing up the partial results
// from all selected experts.
m
.
def
(
"moe_sum(Tensor input, Tensor! output) -> ()"
);
m
.
def
(
"moe_sum_opt1(Tensor input, Tensor! output) -> ()"
);
m
.
impl
(
"moe_sum"
,
torch
::
kCUDA
,
&
moe_sum
);
m
.
impl
(
"moe_sum_opt1"
,
torch
::
kCUDA
,
&
moe_sum_opt1
);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
m
.
def
(
...
...
requirements/rocm.txt
View file @
415b817b
...
...
@@ -24,6 +24,7 @@ torch == 2.5.1
triton == 3.0.0
flash_attn == 2.6.1
flash_mla == 1.0.0
lightop == 0.5.0
lmslim == 0.3.1
numa
python-multipart
...
...
tests/core/block/e2e/test_correctness.py
View file @
415b817b
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
itertools
import
cycle
import
pytest
...
...
@@ -8,10 +9,8 @@ import pytest
from
vllm
import
SamplingParams
from
.conftest
import
get_token_ids_from_llm_generator
import
os
from
....utils
import
models_path_prefix
import
vllm.envs
as
envs
from
vllm.utils
import
SUPPORT_TC
,
gpuname
from
vllm.platforms
import
current_platform
@
pytest
.
mark
.
parametrize
(
...
...
@@ -24,7 +23,7 @@ from vllm.utils import SUPPORT_TC, gpuname
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
...
@@ -107,7 +106,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs"
,
[
{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
...
...
@@ -200,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
},
{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
}])
...
...
@@ -274,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
# Enable prefill cache
...
...
@@ -355,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
...
@@ -430,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly
"max_model_len"
:
48
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
3
,
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
415b817b
...
...
@@ -15,8 +15,7 @@ from vllm.sequence import Logprob, SequenceGroup
from
.utils
import
create_dummy_prompt
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
from
vllm.platforms
import
current_platform
def
get_sequence_groups
(
scheduler_output
):
...
...
@@ -852,7 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs
=
8
,
enable_chunked_prefill
=
True
,
gpu_memory_utilization
=
0.8
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
...
...
tests/core/test_num_computed_tokens_update.py
View file @
415b817b
...
...
@@ -10,8 +10,6 @@ from vllm.engine.llm_engine import LLMEngine
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SequenceGroup
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-160m"
)
...
...
@@ -41,7 +39,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
num_scheduler_steps
=
num_scheduler_steps
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enforce_eager
=
enforce_eager
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
)
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
)
engine
:
LLMEngine
=
runner
.
model
.
llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
...
...
tests/core/test_scheduler.py
View file @
415b817b
...
...
@@ -15,6 +15,7 @@ from vllm.core.interfaces import AllocStatus
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
SequenceGroup
,
SequenceStatus
from
vllm.platforms
import
current_platform
from
.utils
import
(
append_new_token
,
append_new_token_seq
,
append_new_token_seq_group
,
create_dummy_prompt
,
...
...
@@ -22,7 +23,7 @@ from .utils import (append_new_token, append_new_token_seq,
def
test_scheduler_add_seq_group
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
=
100
,
...
...
@@ -45,7 +46,7 @@ def test_scheduler_add_seq_group():
def
test_scheduler_abort_seq_group
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
=
100
,
...
...
@@ -72,7 +73,7 @@ def test_scheduler_abort_seq_group():
def
test_scheduler_schedule_simple
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
num_seq_group
=
4
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
...
...
@@ -117,7 +118,7 @@ def test_scheduler_schedule_simple():
def
test_scheduler_prefill_prioritized
():
"""Verify running batched tokens are not applied to prefill requests."""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
max_model_len
=
30
max_batched_num_tokens
=
30
scheduler_config
=
SchedulerConfig
(
...
...
@@ -150,7 +151,7 @@ def test_scheduler_prefill_prioritized():
def
test_scheduler_schedule_preempt_abort
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
"generate"
,
...
...
@@ -208,7 +209,7 @@ def test_scheduler_schedule_preempt_abort():
def
test_scheduler_max_seqs
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
num_seq_group
=
4
max_seq_group
=
2
max_model_len
=
16
...
...
@@ -256,7 +257,7 @@ def test_scheduler_max_seqs():
def
test_scheduler_delay_factor
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
=
100
,
...
...
@@ -306,7 +307,7 @@ def initialize_scheduler(
max_token_budget
=
1000
,
max_model_len
=
1000
,
lora_config
=
None
,
block_size
=
4
,
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
,
num_cpu_blocks
=
8
,
num_gpu_blocks
=
8
,
enable_prefix_caching
=
False
,
...
...
@@ -354,7 +355,7 @@ def test_prefill_schedule_max_prompt_len():
"""
Test prompt longer than max_prompt_len is aborted.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
block_size
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
,
...
...
@@ -374,7 +375,7 @@ def test_prefill_schedule_token_budget():
"""
Test token budget respected.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
...
...
@@ -436,7 +437,7 @@ def test_prefill_schedule_max_seqs():
"""
Test max seq respected.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
...
...
@@ -475,7 +476,7 @@ def test_prefill_schedule_max_lora():
"""
Test max lora is respected and prioritized.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
...
...
@@ -528,7 +529,7 @@ def test_prefill_schedule_no_block_manager_capacity():
"""
Test sequence cannot be scheduled due to block manager has no capacity.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_gpu_blocks
=
128
,
num_cpu_blocks
=
128
)
...
...
@@ -570,7 +571,7 @@ def test_decode_schedule_preempted():
"""
Test decodes cannot be scheduled and preempted.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
...
...
@@ -614,7 +615,7 @@ def test_schedule_decode_blocks_to_copy_update():
"""
Verify blocks_to_copy is updated.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
4
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
...
...
@@ -646,7 +647,7 @@ def test_schedule_decode_blocks_to_copy_update():
def
test_schedule_swapped_max_loras
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
...
...
@@ -679,7 +680,7 @@ def test_schedule_swapped_max_loras():
def
test_schedule_swapped_cannot_swap_in
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
...
...
@@ -709,7 +710,7 @@ def test_schedule_swapped_cannot_swap_in():
def
test_infeasible_swap
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
...
...
@@ -740,7 +741,7 @@ def test_infeasible_swap():
def
test_schedule_swapped_blocks_to_copy
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
...
...
@@ -825,7 +826,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching):
considering prefix caching.
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
max_num_batched_tokens
=
12
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
...
...
@@ -912,7 +913,7 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
block-size aligned).
"""
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_num_batched_tokens
=
4
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
...
...
@@ -978,7 +979,7 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
Test that the scheduler does not schedule batches with prompt tokens and
prompt embeddings co-mingled.
"""
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
...
...
@@ -1057,7 +1058,7 @@ def test_remove_seq_from_computed_blocks_tracker():
_seq_id_to_num_tokens_computed.
"""
# Budget can not schedule in swapped
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
seq_tokens_with_swapped
:
list
[
list
[
int
]]
=
[]
blocks_to_swap_out
:
list
[
tuple
[
int
,
int
]]
=
[]
...
...
@@ -1097,7 +1098,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill schedule don't have a space for another LoRA, so
# we ignore this request for now.
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
...
...
@@ -1131,7 +1132,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler does not schedule batches with prompt tokens and
# prompt embeddings co-mingled.
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
...
...
@@ -1170,7 +1171,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler budget num_batched_tokens
# >= scheduler_config max_num_batched_tokens
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
seq_tokens_prefill_budget
:
list
[
list
[
int
]]
=
[]
...
...
@@ -1205,7 +1206,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
# Budget can not schedule in waiting
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
...
...
@@ -1241,7 +1242,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
# Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
...
...
@@ -1269,7 +1270,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
# Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
...
...
@@ -1303,7 +1304,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
# Budget can not allocate, AllocStatus is LATER
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
...
...
tests/core/test_scheduler_encoder_decoder.py
View file @
415b817b
...
...
@@ -6,6 +6,7 @@ import pytest # noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
SequenceGroup
from
vllm.platforms
import
current_platform
from
.utils
import
(
append_new_token
,
create_dummy_prompt_encoder_decoder
,
get_sequence_groups
,
schedule_and_update_computed_tokens
)
...
...
@@ -34,7 +35,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
cross-attention block table
'''
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
num_seq_group
=
4
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
...
...
tests/detokenizer/test_disable_detokenization.py
View file @
415b817b
...
...
@@ -7,8 +7,7 @@ import pytest
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
..utils
import
models_path_prefix
import
vllm.envs
as
envs
from
vllm.utils
import
SUPPORT_TC
,
gpuname
from
vllm.platforms
import
current_platform
@
pytest
.
mark
.
skip_v1
...
...
@@ -23,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?"
)
llm
=
LLM
(
model
=
model
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
)
llm
=
LLM
(
model
=
model
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
)
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
temperature
=
0.0
,
detokenize
=
False
)
...
...
tests/entrypoints/test_api_server_process_manager.py
View file @
415b817b
...
...
@@ -95,62 +95,63 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
assert
not
proc
.
is_alive
()
@
patch
(
"vllm.entrypoints.cli.serve.run_api_server_worker"
,
mock_run_api_server_worker
)
def
test_wait_for_completion_or_failure
(
api_server_args
):
"""Test that wait_for_completion_or_failure works with failures."""
global
WORKER_RUNTIME_SECONDS
WORKER_RUNTIME_SECONDS
=
1.0
# Create the manager
manager
=
APIServerProcessManager
(
**
api_server_args
)
try
:
assert
len
(
manager
.
processes
)
==
3
# Create a result capture for the thread
result
:
dict
[
str
,
Optional
[
Exception
]]
=
{
"exception"
:
None
}
def
run_with_exception_capture
():
try
:
wait_for_completion_or_failure
(
api_server_manager
=
manager
)
except
Exception
as
e
:
result
[
"exception"
]
=
e
# Start a thread to run wait_for_completion_or_failure
wait_thread
=
threading
.
Thread
(
target
=
run_with_exception_capture
,
daemon
=
True
)
wait_thread
.
start
()
# Let all processes run for a short time
time
.
sleep
(
0.2
)
# All processes should still be running
assert
all
(
proc
.
is_alive
()
for
proc
in
manager
.
processes
)
# Now simulate a process failure
print
(
"Simulating process failure..."
)
manager
.
processes
[
0
].
terminate
()
# Wait for the wait_for_completion_or_failure
# to detect and handle the failure
# This should trigger it to terminate all other processes
wait_thread
.
join
(
timeout
=
1.0
)
# The wait thread should have exited
assert
not
wait_thread
.
is_alive
()
# Verify that an exception was raised with appropriate error message
assert
result
[
"exception"
]
is
not
None
assert
"died with exit code"
in
str
(
result
[
"exception"
])
# All processes should now be terminated
for
i
,
proc
in
enumerate
(
manager
.
processes
):
assert
not
proc
.
is_alive
(),
f
"Process
{
i
}
should not be alive"
finally
:
manager
.
close
()
time
.
sleep
(
0.2
)
# TODO
# @patch("vllm.entrypoints.cli.serve.run_api_server_worker",
# mock_run_api_server_worker)
# def test_wait_for_completion_or_failure(api_server_args):
# """Test that wait_for_completion_or_failure works with failures."""
# global WORKER_RUNTIME_SECONDS
# WORKER_RUNTIME_SECONDS = 1.0
# # Create the manager
# manager = APIServerProcessManager(**api_server_args)
# try:
# assert len(manager.processes) == 3
# # Create a result capture for the thread
# result: dict[str, Optional[Exception]] = {"exception": None}
# def run_with_exception_capture():
# try:
# wait_for_completion_or_failure(api_server_manager=manager)
# except Exception as e:
# result["exception"] = e
# # Start a thread to run wait_for_completion_or_failure
# wait_thread = threading.Thread(target=run_with_exception_capture,
# daemon=True)
# wait_thread.start()
# # Let all processes run for a short time
# time.sleep(0.2)
# # All processes should still be running
# assert all(proc.is_alive() for proc in manager.processes)
# # Now simulate a process failure
# print("Simulating process failure...")
# manager.processes[0].terminate()
# # Wait for the wait_for_completion_or_failure
# # to detect and handle the failure
# # This should trigger it to terminate all other processes
# wait_thread.join(timeout=1.0)
# # The wait thread should have exited
# assert not wait_thread.is_alive()
# # Verify that an exception was raised with appropriate error message
# assert result["exception"] is not None
# assert "died with exit code" in str(result["exception"])
# # All processes should now be terminated
# for i, proc in enumerate(manager.processes):
# assert not proc.is_alive(), f"Process {i} should not be alive"
# finally:
# manager.close()
# time.sleep(0.2)
@
pytest
.
mark
.
timeout
(
30
)
...
...
tests/entrypoints/test_chat_utils.py
View file @
415b817b
...
...
@@ -914,14 +914,14 @@ def test_resolve_content_format_hf_defined(model, expected_format):
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"model"
,
"expected_format"
),
[(
"Salesforce/blip2-opt-2.7b"
,
"string"
),
(
"facebook/chameleon-7b"
,
"string"
),
(
"deepseek-ai/deepseek-vl2-tiny"
,
"string"
),
(
"microsoft/Florence-2-base"
,
"string"
),
(
"adept/fuyu-8b"
,
"string"
),
(
"google/paligemma-3b-mix-224"
,
"string"
),
(
"Qwen/Qwen-VL"
,
"string"
),
(
"Qwen/Qwen-VL-Chat"
,
"string"
)],
[(
os
.
path
.
join
(
models_path_prefix
,
"Salesforce/blip2-opt-2.7b"
)
,
"string"
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/chameleon-7b"
)
,
"string"
),
(
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-vl2-tiny"
)
,
"string"
),
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Florence-2-base"
)
,
"string"
),
(
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
)
,
"string"
),
(
os
.
path
.
join
(
models_path_prefix
,
"google/paligemma-3b-mix-224"
)
,
"string"
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL"
)
,
"string"
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
)
,
"string"
)],
)
# yapf: enable
def
test_resolve_content_format_fallbacks
(
model
,
expected_format
):
...
...
tests/fastsafetensors_loader/test_fastsafetensors_loader.py
→
tests/fastsafetensors_loader/
un
test_fastsafetensors_loader.py
View file @
415b817b
File moved
tests/fastsafetensors_loader/test_weight_utils.py
→
tests/fastsafetensors_loader/
un
test_weight_utils.py
View file @
415b817b
File moved
tests/kernels/mamba/test_mamba_ssm_ssd.py
View file @
415b817b
...
...
@@ -230,31 +230,31 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
# @pytest.mark.parametrize("n_heads", [4, 8, 13])
# @pytest.mark.parametrize("d_head", [5, 16, 21, 32])
# @pytest.mark.parametrize(
"seq_len_chunk_size_cases"
,
[
# small-ish chunk_size (8)
(
64
,
8
,
2
,
[(
64
,
32
),
(
64
,
32
)]),
(
64
,
8
,
2
,
[(
32
,
32
),
(
32
,
32
),
(
32
,
32
)]),
(
64
,
8
,
2
,
[(
8
,
8
),
(
8
,
8
),
(
8
,
8
)]),
# chunk size boundary
(
64
,
8
,
2
,
[(
4
,
4
),
(
4
,
4
),
(
4
,
4
),
(
4
,
4
)]),
# chunk_size larger than cont batches
(
64
,
8
,
5
,
[
(
64
,
32
,
16
,
8
,
8
),
(
8
,
16
,
32
,
16
,
8
),
(
8
,
8
,
16
,
32
,
16
),
]),
# mode examples with varied lengths
# odd chunk_size
(
64
,
29
,
2
,
[(
11
,
4
),
(
13
,
23
),
(
19
,
22
),
(
21
,
15
)]),
# irregular sizes
# large-ish chunk_size (256)
(
64
,
256
,
1
,
[(
5
,
),
(
1
,
),
(
1
,
),
(
1
,
)]),
# irregular sizes with small sequences
(
64
,
256
,
2
,
[(
5
,
30
),
(
1
,
2
),
(
1
,
2
),
(
1
,
2
)]),
# irregular sizes with small sequences
]
)
#
"seq_len_chunk_size_cases",
#
[
#
# small-ish chunk_size (8)
#
(64, 8, 2, [(64, 32), (64, 32)]),
#
(64, 8, 2, [(32, 32), (32, 32), (32, 32)]),
#
(64, 8, 2, [(8, 8), (8, 8), (8, 8)]), # chunk size boundary
#
(64, 8, 2, [(4, 4), (4, 4), (4, 4),
#
(4, 4)]), # chunk_size larger than cont batches
#
(64, 8, 5, [
#
(64, 32, 16, 8, 8),
#
(8, 16, 32, 16, 8),
#
(8, 8, 16, 32, 16),
#
]), # mode examples with varied lengths
#
# odd chunk_size
#
(64, 29, 2, [(11, 4), (13, 23), (19, 22),
#
(21, 15)]), # irregular sizes
#
# large-ish chunk_size (256)
#
(64, 256, 1, [(5, ), (1, ), (1, ),
#
(1, )]), # irregular sizes with small sequences
#
(64, 256, 2, [(5, 30), (1, 2), (1, 2),
#
(1, 2)]), # irregular sizes with small sequences
#
]
# def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
# itype):
...
...
tests/kernels/quantization/test_block_int8.py
→
tests/kernels/quantization/
un
test_block_int8.py
View file @
415b817b
File moved
tests/kernels/quantization/test_triton_scaled_mm.py
→
tests/kernels/quantization/
un
test_triton_scaled_mm.py
View file @
415b817b
File moved
tests/metrics/test_metrics.py
View file @
415b817b
...
...
@@ -291,7 +291,7 @@ def test_metric_spec_decode(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"log_interval"
,
[
1
,
3
,
5
,
7
])
@
pytest
.
mark
.
parametrize
(
"log_interval"
,
[
1
,
3
,
5
])
# 7
def
test_metric_spec_decode_interval
(
vllm_runner
,
example_prompts
,
...
...
@@ -405,53 +405,54 @@ def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
metric_value
==
num_requests
),
"Metrics should be collected"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
16
])
def
test_engine_log_metrics_ray
(
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
# This test is quite weak - it only checks that we can use
# RayPrometheusStatLogger without exceptions.
# Checking whether the metrics are actually emitted is unfortunately
# non-trivial.
# We have to run in a Ray task for Ray metrics to be emitted correctly
@
ray
.
remote
(
num_gpus
=
1
)
def
_inner
():
class
_RayPrometheusStatLogger
(
RayPrometheusStatLogger
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
self
.
_i
=
0
super
().
__init__
(
*
args
,
**
kwargs
)
def
log
(
self
,
*
args
,
**
kwargs
):
self
.
_i
+=
1
return
super
().
log
(
*
args
,
**
kwargs
)
engine_args
=
EngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
logger
=
_RayPrometheusStatLogger
(
local_interval
=
0.5
,
labels
=
dict
(
model_name
=
engine
.
model_config
.
served_model_name
),
vllm_config
=
engine
.
vllm_config
)
engine
.
add_logger
(
"ray"
,
logger
)
for
i
,
prompt
in
enumerate
(
example_prompts
):
engine
.
add_request
(
f
"request-id-
{
i
}
"
,
prompt
,
SamplingParams
(
max_tokens
=
max_tokens
),
)
while
engine
.
has_unfinished_requests
():
engine
.
step
()
assert
logger
.
_i
>
0
,
".log must be called at least once"
ray
.
get
(
_inner
.
remote
())
# TODO
# @pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("dtype", ["half"])
# @pytest.mark.parametrize("max_tokens", [16])
# def test_engine_log_metrics_ray(
# example_prompts,
# model: str,
# dtype: str,
# max_tokens: int,
# ) -> None:
# # This test is quite weak - it only checks that we can use
# # RayPrometheusStatLogger without exceptions.
# # Checking whether the metrics are actually emitted is unfortunately
# # non-trivial.
# # We have to run in a Ray task for Ray metrics to be emitted correctly
# @ray.remote(num_gpus=1)
# def _inner():
# class _RayPrometheusStatLogger(RayPrometheusStatLogger):
# def __init__(self, *args, **kwargs):
# self._i = 0
# super().__init__(*args, **kwargs)
# def log(self, *args, **kwargs):
# self._i += 1
# return super().log(*args, **kwargs)
# engine_args = EngineArgs(
# model=model,
# dtype=dtype,
# disable_log_stats=False,
# )
# engine = LLMEngine.from_engine_args(engine_args)
# logger = _RayPrometheusStatLogger(
# local_interval=0.5,
# labels=dict(model_name=engine.model_config.served_model_name),
# vllm_config=engine.vllm_config)
# engine.add_logger("ray", logger)
# for i, prompt in enumerate(example_prompts):
# engine.add_request(
# f"request-id-{i}",
# prompt,
# SamplingParams(max_tokens=max_tokens),
# )
# while engine.has_unfinished_requests():
# engine.step()
# assert logger._i > 0, ".log must be called at least once"
# ray.get(_inner.remote())
tests/model_executor/test_enabled_custom_ops.py
View file @
415b817b
...
...
@@ -140,11 +140,11 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
use_rocm_aiter
)
topk_func
=
dispatch_topk_func
()
is_rocm_aiter_moe_enabled
.
cache_clear
()
if
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
):
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
rocm_aiter_topk_softmax
)
assert
topk_func
==
rocm_aiter_topk_softmax
else
:
#
if current_platform.is_rocm() and int(use_rocm_aiter):
#
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
#
rocm_aiter_topk_softmax)
#
assert topk_func == rocm_aiter_topk_softmax
#
else:
assert
topk_func
==
vllm_topk_softmax
...
...
tests/model_executor/test_weight_utils.py
View file @
415b817b
...
...
@@ -35,20 +35,20 @@ def test_download_weights_from_hf():
# if offline is set and model is not cached
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
=
True
with
pytest
.
raises
(
LocalEntryNotFoundError
):
download_weights_from_hf
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
download_weights_from_hf
(
"facebook/opt-125m"
,
allow_patterns
=
[
"*.safetensors"
,
"*.bin"
],
cache_dir
=
tmpdir
)
# download the model
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
=
False
download_weights_from_hf
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
download_weights_from_hf
(
"facebook/opt-125m"
,
allow_patterns
=
[
"*.safetensors"
,
"*.bin"
],
cache_dir
=
tmpdir
)
# now it should work offline
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
=
True
assert
download_weights_from_hf
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
"facebook/opt-125m"
,
allow_patterns
=
[
"*.safetensors"
,
"*.bin"
],
cache_dir
=
tmpdir
)
is
not
None
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment