Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
48a9e546
Commit
48a9e546
authored
Sep 07, 2025
by
王敏
Browse files
Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev
parents
6372a1f3
c11b09df
Changes
98
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
246 additions
and
117 deletions
+246
-117
csrc/moe/moe_align_sum_kernels.cu
csrc/moe/moe_align_sum_kernels.cu
+90
-0
csrc/moe/moe_ops.h
csrc/moe/moe_ops.h
+1
-0
csrc/moe/torch_bindings.cpp
csrc/moe/torch_bindings.cpp
+2
-1
setup.py
setup.py
+2
-2
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+7
-9
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+17
-14
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+55
-25
tests/compile/test_async_tp.py
tests/compile/test_async_tp.py
+3
-1
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+10
-9
tests/compile/test_config.py
tests/compile/test_config.py
+6
-3
tests/compile/untest_fusion_attn.py
tests/compile/untest_fusion_attn.py
+0
-0
tests/compile/untest_silu_mul_quant_fusion.py
tests/compile/untest_silu_mul_quant_fusion.py
+0
-0
tests/config/test_mp_reducer.py
tests/config/test_mp_reducer.py
+3
-1
tests/conftest.py
tests/conftest.py
+2
-1
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+10
-11
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+2
-3
tests/core/test_num_computed_tokens_update.py
tests/core/test_num_computed_tokens_update.py
+1
-3
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+31
-30
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+2
-1
tests/detokenizer/test_disable_detokenization.py
tests/detokenizer/test_disable_detokenization.py
+2
-3
No files found.
csrc/moe/moe_align_sum_kernels.cu
View file @
48a9e546
...
@@ -173,6 +173,35 @@ __global__ void moe_sum_kernel(
...
@@ -173,6 +173,35 @@ __global__ void moe_sum_kernel(
}
}
}
}
// Sums the TOPK expert outputs of each token along the hidden dimension.
//
// Layout, as established by the indexing below:
//   input : element (token, k, i) lives at  token * TOPK * d + k * d + i
//   out   : element (token, i)    lives at  token * d + i
//
// Work decomposition: the hidden dimension of every token is cut into SPLIT_D
// contiguous slices of ceil(d / SPLIT_D) elements, and one thread block handles
// one (token, slice) pair, so the kernel expects a 1-D grid of
// num_tokens * SPLIT_D blocks. Threads walk their slice with a stride of
// blockDim.x.
//
// Template parameters:
//   scalar_t  - element type of `input` / `out`.
//   TOPK      - number of expert rows summed per token (any value >= 1).
//   SPLIT_D   - number of slices (thread blocks) per token.
//   BLOCK_DIM - width of the shared staging buffer; the launch must use
//               blockDim.x <= BLOCK_DIM because threadIdx.x indexes that buffer.
//
// Parameters:
//   out   - destination buffer, written once per (token, i).
//   input - source buffer, read TOPK times per (token, i).
//   d     - hidden size (elements per row).
template <typename scalar_t, int TOPK, int SPLIT_D, int BLOCK_DIM>
__global__ void moe_sum_sharedmem_topk8(scalar_t* __restrict__ out,
                                        const scalar_t* __restrict__ input,
                                        const int d) {
  static_assert(TOPK >= 1, "moe_sum_sharedmem_topk8 needs at least one row");
  static_assert(SPLIT_D >= 1, "SPLIT_D must be positive");

  const int token_idx = blockIdx.x / SPLIT_D;
  const int sub_block = blockIdx.x % SPLIT_D;
  const int d_per_block = (d + SPLIT_D - 1) / SPLIT_D;

  // Widen to 64 bits *before* multiplying: token_idx * TOPK * d easily exceeds
  // INT_MAX for large batches, and the product used to be formed in int.
  const int64_t row_len = d;
  const int64_t d_start = static_cast<int64_t>(sub_block) * d_per_block;
  const int64_t d_stop = d_start + d_per_block;
  // The last slice may be shorter (or empty) when d is not a multiple of
  // SPLIT_D; clamp explicitly instead of relying on a mixed-type min().
  const int64_t d_end = d_stop < row_len ? d_stop : row_len;
  const int64_t token_offset = static_cast<int64_t>(token_idx) * TOPK * row_len;
  const int64_t out_offset = static_cast<int64_t>(token_idx) * row_len;

  // Per-thread staging area: thread t only ever touches column t.
  __shared__ __align__(16) scalar_t sem_input[TOPK][BLOCK_DIM];

  for (int64_t idx = d_start + threadIdx.x; idx < d_end; idx += blockDim.x) {
    // Issue all TOPK loads for this element before starting the reduction.
#pragma unroll
    for (int k = 0; k < TOPK; ++k) {
      sem_input[k][threadIdx.x] =
          input[token_offset + static_cast<int64_t>(k) * row_len + idx];
    }

    // No __syncthreads() here on purpose. Each thread reads back only the
    // column it wrote itself, so no inter-thread ordering is required, and a
    // barrier inside this loop would be undefined behaviour whenever the slice
    // length is not a multiple of blockDim.x (threads then run a different
    // number of iterations and not all of them reach the barrier).

    // Accumulate left to right in scalar_t, i.e. ((r0 + r1) + r2) + ...,
    // which keeps the rounding order of the unrolled 8-term expression.
    scalar_t x = sem_input[0][threadIdx.x];
#pragma unroll
    for (int k = 1; k < TOPK; ++k) {
      x = x + sem_input[k][threadIdx.x];
    }

    out[out_offset + idx] = x;
  }
}
template
<
typename
scalar_t
>
template
<
typename
scalar_t
>
__global__
void
moe_align_block_size_small_batch_expert_kernel
(
__global__
void
moe_align_block_size_small_batch_expert_kernel
(
const
scalar_t
*
__restrict__
topk_ids
,
const
scalar_t
*
__restrict__
topk_ids
,
...
@@ -353,6 +382,67 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
...
@@ -353,6 +382,67 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
});
});
break
;
break
;
default:
at
::
sum_out
(
output
,
input
,
1
);
break
;
}
}
void
moe_sum_opt1
(
torch
::
Tensor
&
input
,
// [num_tokens, topk, hidden_size]
torch
::
Tensor
&
output
)
// [num_tokens, hidden_size]
{
const
int
hidden_size
=
input
.
size
(
-
1
);
const
auto
num_tokens
=
output
.
numel
()
/
hidden_size
;
const
int
topk
=
input
.
size
(
1
);
dim3
grid
(
num_tokens
);
dim3
block
(
std
::
min
(
hidden_size
,
1024
));
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
output
));
const
cudaStream_t
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
constexpr
int
splitD_
=
8
;
const
int
TOPK8_GRID_DIM
=
num_tokens
*
splitD_
;
constexpr
int
TOPK8_BLOCK_DIM
=
256
;
dim3
grid_8
(
TOPK8_GRID_DIM
);
dim3
block_8
(
TOPK8_BLOCK_DIM
);
switch
(
topk
)
{
case
2
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_kernel"
,
[
&
]
{
vllm
::
moe
::
moe_sum_kernel
<
scalar_t
,
2
><<<
grid
,
block
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
case
3
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_kernel"
,
[
&
]
{
vllm
::
moe
::
moe_sum_kernel
<
scalar_t
,
3
><<<
grid
,
block
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
case
4
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_kernel"
,
[
&
]
{
vllm
::
moe
::
moe_sum_kernel
<
scalar_t
,
4
><<<
grid
,
block
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
case
8
:
VLLM_DISPATCH_FLOATING_TYPES
(
input
.
scalar_type
(),
"moe_sum_sharedmem_topk8"
,
[
&
]{
vllm
::
moe
::
moe_sum_sharedmem_topk8
<
scalar_t
,
8
,
splitD_
,
TOPK8_BLOCK_DIM
><<<
grid_8
,
block_8
,
0
,
stream
>>>
(
output
.
data_ptr
<
scalar_t
>
(),
input
.
data_ptr
<
scalar_t
>
(),
hidden_size
);
});
break
;
default:
default:
at
::
sum_out
(
output
,
input
,
1
);
at
::
sum_out
(
output
,
input
,
1
);
break
;
break
;
...
...
csrc/moe/moe_ops.h
View file @
48a9e546
...
@@ -7,6 +7,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
...
@@ -7,6 +7,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch
::
Tensor
&
gating_output
);
torch
::
Tensor
&
gating_output
);
void
moe_sum
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
output
);
void
moe_sum
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
output
);
void
moe_sum_opt1
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
output
);
void
moe_align_block_size
(
torch
::
Tensor
topk_ids
,
int64_t
num_experts
,
void
moe_align_block_size
(
torch
::
Tensor
topk_ids
,
int64_t
num_experts
,
int64_t
block_size
,
torch
::
Tensor
sorted_token_ids
,
int64_t
block_size
,
torch
::
Tensor
sorted_token_ids
,
...
...
csrc/moe/torch_bindings.cpp
View file @
48a9e546
...
@@ -11,8 +11,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
...
@@ -11,8 +11,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Calculate the result of moe by summing up the partial results
// Calculate the result of moe by summing up the partial results
// from all selected experts.
// from all selected experts.
m
.
def
(
"moe_sum(Tensor input, Tensor! output) -> ()"
);
m
.
def
(
"moe_sum(Tensor input, Tensor! output) -> ()"
);
m
.
def
(
"moe_sum_opt1(Tensor input, Tensor! output) -> ()"
);
m
.
impl
(
"moe_sum"
,
torch
::
kCUDA
,
&
moe_sum
);
m
.
impl
(
"moe_sum"
,
torch
::
kCUDA
,
&
moe_sum
);
m
.
impl
(
"moe_sum_opt1"
,
torch
::
kCUDA
,
&
moe_sum_opt1
);
// Aligning the number of tokens to be processed by each expert such
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
// that it is divisible by the block size.
m
.
def
(
m
.
def
(
...
...
setup.py
View file @
48a9e546
...
@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
...
@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
if
sha
is
None
:
if
sha
is
None
:
sha
=
get_sha
(
vllm_root
)
sha
=
get_sha
(
vllm_root
)
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
version
=
'das.opt1.'
+
sha
[:
7
]
version
=
'das.opt1.
rc1.
'
+
sha
[:
7
]
else
:
else
:
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
if
(
major
,
minor
)
>=
(
'2'
,
'5'
):
version
=
'das.opt1'
version
=
'das.opt1
.rc1
'
# dtk version
# dtk version
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
48a9e546
...
@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
...
@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
from
..utils
import
multi_gpu_test
from
..utils
import
multi_gpu_test
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
gpuname
import
vllm.envs
as
envs
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
...
@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
...
@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
def
test_vllm_gc_ed
():
def
test_vllm_gc_ed
():
"""Verify vllm instance is GC'ed when it is deleted"""
"""Verify vllm instance is GC'ed when it is deleted"""
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
if
not
current_platform
.
is_rocm
():
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
block_size
=
64
)
else
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
else
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
block_size
=
64
)
weak_llm
=
weakref
.
ref
(
llm
)
weak_llm
=
weakref
.
ref
(
llm
)
del
llm
del
llm
...
@@ -111,13 +109,12 @@ def test_models(
...
@@ -111,13 +109,12 @@ def test_models(
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
example_prompts
)
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
if
not
current_platform
.
is_rocm
()
:
with
VllmRunner
(
model
,
with
VllmRunner
(
model
,
max_model_len
=
8192
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
block_size
=
64
)
as
vllm_model
:
if
enable_prompt_embeds
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
prompt_embeds
,
max_tokens
)
...
@@ -131,7 +128,8 @@ def test_models(
...
@@ -131,7 +128,8 @@ def test_models(
max_model_len
=
8192
,
max_model_len
=
8192
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
enable_prompt_embeds
=
enable_prompt_embeds
,
enable_prompt_embeds
=
enable_prompt_embeds
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
gpu_memory_utilization
=
0.7
,
block_size
=
64
)
as
vllm_model
:
if
enable_prompt_embeds
:
if
enable_prompt_embeds
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
prompt_embeds
,
max_tokens
)
prompt_embeds
,
max_tokens
)
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
48a9e546
...
@@ -94,7 +94,7 @@ def test_models(
...
@@ -94,7 +94,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
max_tokens
)
...
@@ -128,7 +128,7 @@ def test_models_distributed(
...
@@ -128,7 +128,7 @@ def test_models_distributed(
)
->
None
:
)
->
None
:
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
if
(
model
==
"meta-llama/Llama-3.2-1B-Instruct"
if
(
model
==
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
and
distributed_executor_backend
==
"ray"
):
and
distributed_executor_backend
==
"ray"
):
# test Ray Compiled Graph
# test Ray Compiled Graph
m
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
m
.
setenv
(
"VLLM_USE_RAY_SPMD_WORKER"
,
"1"
)
...
@@ -158,7 +158,7 @@ def test_models_distributed(
...
@@ -158,7 +158,7 @@ def test_models_distributed(
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
example_prompts
,
...
@@ -220,22 +220,25 @@ def test_models_with_fp8_kv_cache(
...
@@ -220,22 +220,25 @@ def test_models_with_fp8_kv_cache(
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
no_chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
with
vllm_runner
(
with
vllm_runner
(
model
,
model
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
chunked_prefill_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
no_chunked_prefill_outputs
,
outputs_0_lst
=
no_chunked_prefill_outputs
,
...
@@ -286,7 +289,7 @@ def test_with_prefix_caching(
...
@@ -286,7 +289,7 @@ def test_with_prefix_caching(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
outputs
[
enable
]
=
[]
outputs
[
enable
]
=
[]
for
prompt
in
full_prompts
:
for
prompt
in
full_prompts
:
...
@@ -303,7 +306,7 @@ def test_with_prefix_caching(
...
@@ -303,7 +306,7 @@ def test_with_prefix_caching(
)
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
,
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
,
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
1
,
4
,
16
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
1
,
4
,
16
])
...
...
tests/basic_correctness/test_preemption.py
View file @
48a9e546
...
@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
...
@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
pytest tests/basic_correctness/test_preemption.py`.
"""
"""
import
os
import
pytest
import
pytest
from
prometheus_client
import
REGISTRY
from
prometheus_client
import
REGISTRY
...
@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
...
@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
import
os
from
vllm.platforms
import
current_platform
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
...
@@ -74,18 +75,33 @@ def test_chunked_prefill_recompute(
...
@@ -74,18 +75,33 @@ def test_chunked_prefill_recompute(
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
if
not
current_platform
.
is_rocm
():
model
,
with
vllm_runner
(
dtype
=
dtype
,
model
,
max_num_batched_tokens
=
max_num_batched_tokens
,
dtype
=
dtype
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
distributed_executor_backend
=
distributed_executor_backend
,
max_num_seqs
=
max_num_seqs
,
disable_log_stats
=
False
,
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
disable_log_stats
=
False
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
)
as
vllm_model
:
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
else
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_seqs
=
max_num_seqs
,
distributed_executor_backend
=
distributed_executor_backend
,
disable_log_stats
=
False
,
block_size
=
64
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
@@ -115,17 +131,31 @@ def test_preemption(
...
@@ -115,17 +131,31 @@ def test_preemption(
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
if
not
current_platform
.
is_rocm
():
model
,
with
vllm_runner
(
dtype
=
dtype
,
model
,
disable_log_stats
=
False
,
dtype
=
dtype
,
distributed_executor_backend
=
distributed_executor_backend
,
disable_log_stats
=
False
,
)
as
vllm_model
:
distributed_executor_backend
=
distributed_executor_backend
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
)
as
vllm_model
:
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
total_preemption
=
(
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
else
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
check_outputs_equal
(
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
hf_outputs
,
...
@@ -163,7 +193,7 @@ def test_preemption_infeasible(
...
@@ -163,7 +193,7 @@ def test_preemption_infeasible(
distributed_executor_backend
:
str
,
distributed_executor_backend
:
str
,
)
->
None
:
)
->
None
:
"""Verify infeasible preemption request will be ignored."""
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE
=
16
BLOCK_SIZE
=
16
if
not
current_platform
.
is_rocm
()
else
64
prefill_blocks
=
2
prefill_blocks
=
2
decode_blocks
=
max_tokens
//
BLOCK_SIZE
decode_blocks
=
max_tokens
//
BLOCK_SIZE
with
vllm_runner
(
with
vllm_runner
(
...
...
tests/compile/test_async_tp.py
View file @
48a9e546
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
json
import
json
import
pytest
import
pytest
...
@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
...
@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
from
..utils
import
(
compare_two_settings
,
create_new_process_for_each_test
,
from
..utils
import
(
compare_two_settings
,
create_new_process_for_each_test
,
multi_gpu_test
)
multi_gpu_test
)
from
.backend
import
TestBackend
from
.backend
import
TestBackend
from
..utils
import
models_path_prefix
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
...
@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"meta-llama/Llama-3.2-1B-Instruct"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"async_tp_enabled"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"async_tp_enabled"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"distributed_backend"
,
[
"mp"
])
@
pytest
.
mark
.
parametrize
(
"distributed_backend"
,
[
"mp"
])
...
...
tests/compile/test_basic_correctness.py
View file @
48a9e546
...
@@ -84,16 +84,17 @@ class TestSetting:
...
@@ -84,16 +84,17 @@ class TestSetting:
# method="encode",
# method="encode",
# fullgraph=True,
# fullgraph=True,
# ),
# ),
# TODO
# vision language model
# vision language model
TestSetting
(
#
TestSetting(
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
),
#
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args
=
[
"--trust-remote-code"
,
"--max-model-len"
,
"2048"
],
#
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size
=
2
,
#
pp_size=2,
tp_size
=
1
,
#
tp_size=1,
attn_backend
=
"FLASH_ATTN"
,
#
attn_backend="FLASH_ATTN",
method
=
"generate_with_image"
,
#
method="generate_with_image",
fullgraph
=
False
,
#
fullgraph=False,
),
#
),
])
])
def
test_compile_correctness
(
def
test_compile_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
...
...
tests/compile/test_config.py
View file @
48a9e546
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
import
vllm
import
vllm
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.counter
import
compilation_counter
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.utils
import
_is_torch_equal_or_newer
from
vllm.utils
import
_is_torch_equal_or_newer
from
..utils
import
models_path_prefix
def
test_version
():
def
test_version
():
assert
_is_torch_equal_or_newer
(
'2.8.0.dev20250624+cu128'
,
'2.8.0.dev'
)
assert
_is_torch_equal_or_newer
(
'2.8.0.dev20250624+cu128'
,
'2.8.0.dev'
)
...
@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
...
@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
True
,
False
])
# TODO: when True num_cudagraph_captured=13
# @pytest.mark.parametrize("enabled", [True, False])
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
False
])
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
assert
vllm
.
envs
.
VLLM_USE_V1
assert
vllm
.
envs
.
VLLM_USE_V1
...
@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
...
@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
num_cudagraph_captured
=
13
if
enabled
else
0
,
num_cudagraph_captured
=
13
if
enabled
else
0
,
),
),
# loading the model causes compilation (if enabled) to happen
# loading the model causes compilation (if enabled) to happen
vllm_runner
(
'facebook/opt-125m'
,
vllm_runner
(
os
.
path
.
join
(
models_path_prefix
,
'facebook/opt-125m'
)
,
compilation_config
=
compilation_config
,
compilation_config
=
compilation_config
,
gpu_memory_utilization
=
0.4
)
as
_
):
gpu_memory_utilization
=
0.4
)
as
_
):
pass
pass
tests/compile/test_fusion_attn.py
→
tests/compile/
un
test_fusion_attn.py
View file @
48a9e546
File moved
tests/compile/test_silu_mul_quant_fusion.py
→
tests/compile/
un
test_silu_mul_quant_fusion.py
View file @
48a9e546
File moved
tests/config/test_mp_reducer.py
View file @
48a9e546
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
sys
import
sys
from
unittest.mock
import
patch
from
unittest.mock
import
patch
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
..utils
import
models_path_prefix
def
test_mp_reducer
(
monkeypatch
):
def
test_mp_reducer
(
monkeypatch
):
...
@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
...
@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
with
patch
(
'multiprocessing.reducer.register'
)
as
mock_register
:
with
patch
(
'multiprocessing.reducer.register'
)
as
mock_register
:
engine_args
=
AsyncEngineArgs
(
engine_args
=
AsyncEngineArgs
(
model
=
"facebook/opt-125m"
,
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
max_model_len
=
32
,
max_model_len
=
32
,
gpu_memory_utilization
=
0.1
,
gpu_memory_utilization
=
0.1
,
disable_log_stats
=
True
,
disable_log_stats
=
True
,
...
...
tests/conftest.py
View file @
48a9e546
...
@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
...
@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
from
vllm.transformers_utils.utils
import
maybe_model_redirect
from
vllm.transformers_utils.utils
import
maybe_model_redirect
from
.utils
import
models_path_prefix
from
.utils
import
models_path_prefix
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -783,7 +784,7 @@ class VllmRunner:
...
@@ -783,7 +784,7 @@ class VllmRunner:
dtype
:
str
=
"auto"
,
dtype
:
str
=
"auto"
,
disable_log_stats
:
bool
=
True
,
disable_log_stats
:
bool
=
True
,
tensor_parallel_size
:
int
=
1
,
tensor_parallel_size
:
int
=
1
,
block_size
:
int
=
16
,
block_size
:
int
=
16
if
not
current_platform
.
is_rocm
()
else
64
,
enable_chunked_prefill
:
Optional
[
bool
]
=
False
,
enable_chunked_prefill
:
Optional
[
bool
]
=
False
,
swap_space
:
int
=
4
,
swap_space
:
int
=
4
,
enforce_eager
:
Optional
[
bool
]
=
False
,
enforce_eager
:
Optional
[
bool
]
=
False
,
...
...
tests/core/block/e2e/test_correctness.py
View file @
48a9e546
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
itertools
import
cycle
from
itertools
import
cycle
import
pytest
import
pytest
...
@@ -8,10 +9,8 @@ import pytest
...
@@ -8,10 +9,8 @@ import pytest
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
.conftest
import
get_token_ids_from_llm_generator
from
.conftest
import
get_token_ids_from_llm_generator
import
os
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
import
vllm.envs
as
envs
from
vllm.platforms
import
current_platform
from
vllm.utils
import
SUPPORT_TC
,
gpuname
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -24,7 +23,7 @@ from vllm.utils import SUPPORT_TC, gpuname
...
@@ -24,7 +23,7 @@ from vllm.utils import SUPPORT_TC, gpuname
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
@@ -107,7 +106,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
...
@@ -107,7 +106,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
[
[
{
{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
# Allow only 2 sequences of ~128 tokens in worst case.
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
# Note 8 = 128/block_size
...
@@ -200,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
...
@@ -200,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{
[{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"max_num_batched_tokens"
:
2
,
"max_num_batched_tokens"
:
2
,
"max_num_seqs"
:
2
,
"max_num_seqs"
:
2
,
},
{
},
{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"max_num_batched_tokens"
:
3
,
"max_num_batched_tokens"
:
3
,
"max_num_seqs"
:
2
,
"max_num_seqs"
:
2
,
},
{
},
{
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"max_num_batched_tokens"
:
256
,
"max_num_batched_tokens"
:
256
,
"max_num_seqs"
:
10
,
"max_num_seqs"
:
10
,
}])
}])
...
@@ -274,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
...
@@ -274,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
# Enable prefill cache
# Enable prefill cache
...
@@ -355,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
...
@@ -355,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Allow only 5 sequences of ~1024 tokens in worst case.
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
"num_gpu_blocks_override"
:
5
*
(
64
+
1
),
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
@@ -430,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
...
@@ -430,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly
# we keep the blocks small, so that hit eviction quickly
"max_model_len"
:
48
,
"max_model_len"
:
48
,
"block_size"
:
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
"block_size"
:
64
if
current_platform
.
is_rocm
()
else
16
,
"num_gpu_blocks_override"
:
3
,
"num_gpu_blocks_override"
:
3
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
48a9e546
...
@@ -15,8 +15,7 @@ from vllm.sequence import Logprob, SequenceGroup
...
@@ -15,8 +15,7 @@ from vllm.sequence import Logprob, SequenceGroup
from
.utils
import
create_dummy_prompt
from
.utils
import
create_dummy_prompt
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
from
vllm.platforms
import
current_platform
import
vllm.envs
as
envs
def
get_sequence_groups
(
scheduler_output
):
def
get_sequence_groups
(
scheduler_output
):
...
@@ -852,7 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
...
@@ -852,7 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs
=
8
,
max_num_seqs
=
8
,
enable_chunked_prefill
=
True
,
enable_chunked_prefill
=
True
,
gpu_memory_utilization
=
0.8
,
gpu_memory_utilization
=
0.8
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
,
)
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
...
...
tests/core/test_num_computed_tokens_update.py
View file @
48a9e546
...
@@ -10,8 +10,6 @@ from vllm.engine.llm_engine import LLMEngine
...
@@ -10,8 +10,6 @@ from vllm.engine.llm_engine import LLMEngine
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SequenceGroup
from
vllm.sequence
import
SequenceGroup
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-160m"
)
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-160m"
)
...
@@ -41,7 +39,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
...
@@ -41,7 +39,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
num_scheduler_steps
=
num_scheduler_steps
,
num_scheduler_steps
=
num_scheduler_steps
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
)
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
)
engine
:
LLMEngine
=
runner
.
model
.
llm_engine
engine
:
LLMEngine
=
runner
.
model
.
llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
# In multi-step + chunked-prefill there is no separate single prompt step.
...
...
tests/core/test_scheduler.py
View file @
48a9e546
...
@@ -15,6 +15,7 @@ from vllm.core.interfaces import AllocStatus
...
@@ -15,6 +15,7 @@ from vllm.core.interfaces import AllocStatus
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
SequenceGroup
,
SequenceStatus
from
vllm.sequence
import
SequenceGroup
,
SequenceStatus
from
vllm.platforms
import
current_platform
from
.utils
import
(
append_new_token
,
append_new_token_seq
,
from
.utils
import
(
append_new_token
,
append_new_token_seq
,
append_new_token_seq_group
,
create_dummy_prompt
,
append_new_token_seq_group
,
create_dummy_prompt
,
...
@@ -22,7 +23,7 @@ from .utils import (append_new_token, append_new_token_seq,
...
@@ -22,7 +23,7 @@ from .utils import (append_new_token, append_new_token_seq,
def
test_scheduler_add_seq_group
():
def
test_scheduler_add_seq_group
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
"generate"
,
max_num_batched_tokens
=
100
,
max_num_batched_tokens
=
100
,
...
@@ -45,7 +46,7 @@ def test_scheduler_add_seq_group():
...
@@ -45,7 +46,7 @@ def test_scheduler_add_seq_group():
def
test_scheduler_abort_seq_group
():
def
test_scheduler_abort_seq_group
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
"generate"
,
max_num_batched_tokens
=
100
,
max_num_batched_tokens
=
100
,
...
@@ -72,7 +73,7 @@ def test_scheduler_abort_seq_group():
...
@@ -72,7 +73,7 @@ def test_scheduler_abort_seq_group():
def
test_scheduler_schedule_simple
():
def
test_scheduler_schedule_simple
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
...
@@ -117,7 +118,7 @@ def test_scheduler_schedule_simple():
...
@@ -117,7 +118,7 @@ def test_scheduler_schedule_simple():
def
test_scheduler_prefill_prioritized
():
def
test_scheduler_prefill_prioritized
():
"""Verify running batched tokens are not applied to prefill requests."""
"""Verify running batched tokens are not applied to prefill requests."""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
max_model_len
=
30
max_model_len
=
30
max_batched_num_tokens
=
30
max_batched_num_tokens
=
30
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
...
@@ -150,7 +151,7 @@ def test_scheduler_prefill_prioritized():
...
@@ -150,7 +151,7 @@ def test_scheduler_prefill_prioritized():
def
test_scheduler_schedule_preempt_abort
():
def
test_scheduler_schedule_preempt_abort
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
"generate"
,
...
@@ -208,7 +209,7 @@ def test_scheduler_schedule_preempt_abort():
...
@@ -208,7 +209,7 @@ def test_scheduler_schedule_preempt_abort():
def
test_scheduler_max_seqs
():
def
test_scheduler_max_seqs
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
num_seq_group
=
4
num_seq_group
=
4
max_seq_group
=
2
max_seq_group
=
2
max_model_len
=
16
max_model_len
=
16
...
@@ -256,7 +257,7 @@ def test_scheduler_max_seqs():
...
@@ -256,7 +257,7 @@ def test_scheduler_max_seqs():
def
test_scheduler_delay_factor
():
def
test_scheduler_delay_factor
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
"generate"
,
max_num_batched_tokens
=
100
,
max_num_batched_tokens
=
100
,
...
@@ -306,7 +307,7 @@ def initialize_scheduler(
...
@@ -306,7 +307,7 @@ def initialize_scheduler(
max_token_budget
=
1000
,
max_token_budget
=
1000
,
max_model_len
=
1000
,
max_model_len
=
1000
,
lora_config
=
None
,
lora_config
=
None
,
block_size
=
4
,
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
,
num_cpu_blocks
=
8
,
num_cpu_blocks
=
8
,
num_gpu_blocks
=
8
,
num_gpu_blocks
=
8
,
enable_prefix_caching
=
False
,
enable_prefix_caching
=
False
,
...
@@ -354,7 +355,7 @@ def test_prefill_schedule_max_prompt_len():
...
@@ -354,7 +355,7 @@ def test_prefill_schedule_max_prompt_len():
"""
"""
Test prompt longer than max_prompt_len is aborted.
Test prompt longer than max_prompt_len is aborted.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
block_size
=
block_size
)
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
block_size
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
,
prompt_length
=
60
,
...
@@ -374,7 +375,7 @@ def test_prefill_schedule_token_budget():
...
@@ -374,7 +375,7 @@ def test_prefill_schedule_token_budget():
"""
"""
Test token budget respected.
Test token budget respected.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
...
@@ -436,7 +437,7 @@ def test_prefill_schedule_max_seqs():
...
@@ -436,7 +437,7 @@ def test_prefill_schedule_max_seqs():
"""
"""
Test max seq respected.
Test max seq respected.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
...
@@ -475,7 +476,7 @@ def test_prefill_schedule_max_lora():
...
@@ -475,7 +476,7 @@ def test_prefill_schedule_max_lora():
"""
"""
Test max lora is respected and prioritized.
Test max lora is respected and prioritized.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
block_size
=
block_size
,
...
@@ -528,7 +529,7 @@ def test_prefill_schedule_no_block_manager_capacity():
...
@@ -528,7 +529,7 @@ def test_prefill_schedule_no_block_manager_capacity():
"""
"""
Test sequence cannot be scheduled due to block manager has no capacity.
Test sequence cannot be scheduled due to block manager has no capacity.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_gpu_blocks
=
128
,
num_gpu_blocks
=
128
,
num_cpu_blocks
=
128
)
num_cpu_blocks
=
128
)
...
@@ -570,7 +571,7 @@ def test_decode_schedule_preempted():
...
@@ -570,7 +571,7 @@ def test_decode_schedule_preempted():
"""
"""
Test decodes cannot be scheduled and preempted.
Test decodes cannot be scheduled and preempted.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
...
@@ -614,7 +615,7 @@ def test_schedule_decode_blocks_to_copy_update():
...
@@ -614,7 +615,7 @@ def test_schedule_decode_blocks_to_copy_update():
"""
"""
Verify blocks_to_copy is updated.
Verify blocks_to_copy is updated.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
4
,
scheduler
=
initialize_scheduler
(
block_size
=
4
,
num_cpu_blocks
=
16
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
num_gpu_blocks
=
16
)
...
@@ -646,7 +647,7 @@ def test_schedule_decode_blocks_to_copy_update():
...
@@ -646,7 +647,7 @@ def test_schedule_decode_blocks_to_copy_update():
def
test_schedule_swapped_max_loras
():
def
test_schedule_swapped_max_loras
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
block_size
=
block_size
,
...
@@ -679,7 +680,7 @@ def test_schedule_swapped_max_loras():
...
@@ -679,7 +680,7 @@ def test_schedule_swapped_max_loras():
def
test_schedule_swapped_cannot_swap_in
():
def
test_schedule_swapped_cannot_swap_in
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
...
@@ -709,7 +710,7 @@ def test_schedule_swapped_cannot_swap_in():
...
@@ -709,7 +710,7 @@ def test_schedule_swapped_cannot_swap_in():
def
test_infeasible_swap
():
def
test_infeasible_swap
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
...
@@ -740,7 +741,7 @@ def test_infeasible_swap():
...
@@ -740,7 +741,7 @@ def test_infeasible_swap():
def
test_schedule_swapped_blocks_to_copy
():
def
test_schedule_swapped_blocks_to_copy
():
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
...
@@ -825,7 +826,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching):
...
@@ -825,7 +826,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching):
considering prefix caching.
considering prefix caching.
"""
"""
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
max_num_batched_tokens
=
12
max_num_batched_tokens
=
12
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
...
@@ -912,7 +913,7 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
...
@@ -912,7 +913,7 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
block-size aligned).
block-size aligned).
"""
"""
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_num_batched_tokens
=
4
max_num_batched_tokens
=
4
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
...
@@ -978,7 +979,7 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
...
@@ -978,7 +979,7 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
Test that the scheduler does not schedule batches with prompt tokens and
Test that the scheduler does not schedule batches with prompt tokens and
prompt embeddings co-mingled.
prompt embeddings co-mingled.
"""
"""
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
...
@@ -1057,7 +1058,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1057,7 +1058,7 @@ def test_remove_seq_from_computed_blocks_tracker():
_seq_id_to_num_tokens_computed.
_seq_id_to_num_tokens_computed.
"""
"""
# Budget can not schedule in swapped
# Budget can not schedule in swapped
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
seq_tokens_with_swapped
:
list
[
list
[
int
]]
=
[]
seq_tokens_with_swapped
:
list
[
list
[
int
]]
=
[]
blocks_to_swap_out
:
list
[
tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
list
[
tuple
[
int
,
int
]]
=
[]
...
@@ -1097,7 +1098,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1097,7 +1098,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill schedule don't have a space for another LoRA, so
# Prefill schedule don't have a space for another LoRA, so
# we ignore this request for now.
# we ignore this request for now.
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
block_size
=
block_size
,
block_size
=
block_size
,
...
@@ -1131,7 +1132,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1131,7 +1132,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler does not schedule batches with prompt tokens and
# Prefill scheduler does not schedule batches with prompt tokens and
# prompt embeddings co-mingled.
# prompt embeddings co-mingled.
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
...
@@ -1170,7 +1171,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1170,7 +1171,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler budget num_batched_tokens
# Prefill scheduler budget num_batched_tokens
# >= scheduler_config max_num_batched_tokens
# >= scheduler_config max_num_batched_tokens
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
seq_tokens_prefill_budget
:
list
[
list
[
int
]]
=
[]
seq_tokens_prefill_budget
:
list
[
list
[
int
]]
=
[]
...
@@ -1205,7 +1206,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1205,7 +1206,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
assert
seq_id_to_num_tokens_computed
is
None
# Budget can not schedule in waiting
# Budget can not schedule in waiting
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
...
@@ -1241,7 +1242,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1241,7 +1242,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
assert
seq_id_to_num_tokens_computed
is
None
# Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED
# Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
...
@@ -1269,7 +1270,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1269,7 +1270,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
assert
seq_id_to_num_tokens_computed
is
None
# Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED
# Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
...
@@ -1303,7 +1304,7 @@ def test_remove_seq_from_computed_blocks_tracker():
...
@@ -1303,7 +1304,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert
seq_id_to_num_tokens_computed
is
None
assert
seq_id_to_num_tokens_computed
is
None
# Budget can not allocate, AllocStatus is LATER
# Budget can not allocate, AllocStatus is LATER
block_size
=
2
block_size
=
2
if
not
current_platform
.
is_rocm
()
else
64
max_seq_group
=
3
max_seq_group
=
3
scheduler
=
initialize_scheduler
(
scheduler
=
initialize_scheduler
(
block_size
=
block_size
,
block_size
=
block_size
,
...
...
tests/core/test_scheduler_encoder_decoder.py
View file @
48a9e546
...
@@ -6,6 +6,7 @@ import pytest # noqa
...
@@ -6,6 +6,7 @@ import pytest # noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
SequenceGroup
from
vllm.sequence
import
SequenceGroup
from
vllm.platforms
import
current_platform
from
.utils
import
(
append_new_token
,
create_dummy_prompt_encoder_decoder
,
from
.utils
import
(
append_new_token
,
create_dummy_prompt_encoder_decoder
,
get_sequence_groups
,
schedule_and_update_computed_tokens
)
get_sequence_groups
,
schedule_and_update_computed_tokens
)
...
@@ -34,7 +35,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
...
@@ -34,7 +35,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
cross-attention block table
cross-attention block table
'''
'''
block_size
=
4
block_size
=
4
if
not
current_platform
.
is_rocm
()
else
64
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
...
...
tests/detokenizer/test_disable_detokenization.py
View file @
48a9e546
...
@@ -7,8 +7,7 @@ import pytest
...
@@ -7,8 +7,7 @@ import pytest
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
import
vllm.envs
as
envs
from
vllm.platforms
import
current_platform
from
vllm.utils
import
SUPPORT_TC
,
gpuname
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
skip_v1
...
@@ -23,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
...
@@ -23,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"paper clips? Is there an easy to follow video tutorial available "
"online for free?"
)
"online for free?"
)
llm
=
LLM
(
model
=
model
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
)
llm
=
LLM
(
model
=
model
,
block_size
=
64
if
current_platform
.
is_rocm
()
else
16
)
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
sampling_params
=
SamplingParams
(
max_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
detokenize
=
False
)
detokenize
=
False
)
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment