Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
823ab796
Unverified
Commit
823ab796
authored
Jan 28, 2025
by
Harry Mellor
Committed by
GitHub
Jan 27, 2025
Browse files
Update `pre-commit` hooks (#12475)
Signed-off-by:
Harry Mellor
<
19981378+hmellor@users.noreply.github.com
>
parent
6116ca8c
Changes
64
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
165 additions
and
145 deletions
+165
-145
.pre-commit-config.yaml
.pre-commit-config.yaml
+5
-5
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+2
-2
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce.cuh
+6
-2
csrc/moe/marlin_kernels/marlin_moe_kernel.h
csrc/moe/marlin_kernels/marlin_moe_kernel.h
+4
-4
csrc/quantization/gptq_marlin/gptq_marlin.cu
csrc/quantization/gptq_marlin/gptq_marlin.cu
+8
-8
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+2
-2
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
+2
-2
csrc/quantization/marlin/sparse/common/mma.h
csrc/quantization/marlin/sparse/common/mma.h
+2
-2
csrc/rocm/attention.cu
csrc/rocm/attention.cu
+3
-1
setup.py
setup.py
+1
-1
tests/kernels/test_block_fp8.py
tests/kernels/test_block_fp8.py
+14
-11
tests/kv_transfer/test_lookup_buffer.py
tests/kv_transfer/test_lookup_buffer.py
+5
-5
tests/lora/test_qwen2vl.py
tests/lora/test_qwen2vl.py
+3
-3
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+70
-60
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+8
-9
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+3
-3
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+8
-7
tools/report_build_time_ninja.py
tools/report_build_time_ninja.py
+3
-2
vllm/_custom_ops.py
vllm/_custom_ops.py
+2
-2
vllm/attention/ops/prefix_prefill.py
vllm/attention/ops/prefix_prefill.py
+14
-14
No files found.
.pre-commit-config.yaml
View file @
823ab796
...
...
@@ -3,18 +3,18 @@ default_stages:
-
manual
# Run in CI
repos
:
-
repo
:
https://github.com/google/yapf
rev
:
v0.3
2
.0
rev
:
v0.
4
3.0
hooks
:
-
id
:
yapf
args
:
[
--in-place
,
--verbose
]
additional_dependencies
:
[
toml
]
# TODO: Remove when yapf is upgraded
-
repo
:
https://github.com/astral-sh/ruff-pre-commit
rev
:
v0.
6.5
rev
:
v0.
9.3
hooks
:
-
id
:
ruff
args
:
[
--output-format
,
github
]
-
repo
:
https://github.com/codespell-project/codespell
rev
:
v2.
3
.0
rev
:
v2.
4
.0
hooks
:
-
id
:
codespell
exclude
:
'
benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
...
...
@@ -23,7 +23,7 @@ repos:
hooks
:
-
id
:
isort
-
repo
:
https://github.com/pre-commit/mirrors-clang-format
rev
:
v1
8
.1.
5
rev
:
v1
9
.1.
7
hooks
:
-
id
:
clang-format
exclude
:
'
csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
...
...
@@ -35,7 +35,7 @@ repos:
-
id
:
pymarkdown
files
:
docs/.*
-
repo
:
https://github.com/rhysd/actionlint
rev
:
v1.7.
6
rev
:
v1.7.
7
hooks
:
-
id
:
actionlint
-
repo
:
local
...
...
benchmarks/benchmark_serving.py
View file @
823ab796
...
...
@@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
)
# Traffic
result_json
[
"request_rate"
]
=
(
args
.
request_rate
if
args
.
request_rate
<
float
(
"inf"
)
else
"inf"
)
result_json
[
"request_rate"
]
=
(
args
.
request_rate
if
args
.
request_rate
<
float
(
"inf"
)
else
"inf"
)
result_json
[
"burstiness"
]
=
args
.
burstiness
result_json
[
"max_concurrency"
]
=
args
.
max_concurrency
...
...
csrc/custom_all_reduce.cuh
View file @
823ab796
...
...
@@ -38,9 +38,13 @@ struct Signal {
alignas
(
128
)
FlagType
peer_counter
[
2
][
kMaxBlocks
][
8
];
};
struct
__align__
(
16
)
RankData
{
const
void
*
__restrict__
ptrs
[
8
];
};
struct
__align__
(
16
)
RankData
{
const
void
*
__restrict__
ptrs
[
8
];
};
struct
__align__
(
16
)
RankSignals
{
Signal
*
signals
[
8
];
};
struct
__align__
(
16
)
RankSignals
{
Signal
*
signals
[
8
];
};
// like std::array, but aligned
template
<
typename
T
,
int
sz
>
...
...
csrc/moe/marlin_kernels/marlin_moe_kernel.h
View file @
823ab796
...
...
@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const
int
SUB
=
0x64086408
;
...
...
@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
const
int
SUB
=
0x64006400
;
const
int
MUL
=
0x2c002c00
;
...
...
csrc/quantization/gptq_marlin/gptq_marlin.cu
View file @
823ab796
...
...
@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const
int
SUB
=
0x64086408
;
...
...
@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
q
>>=
4
;
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
typename
ScalarType
<
nv_bfloat16
>::
FragB
frag_b
;
static
constexpr
uint32_t
MUL
=
0x3F803F80
;
...
...
@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
const
int
SUB
=
0x64006400
;
const
int
MUL
=
0x2c002c00
;
...
...
@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
q
>>=
4
;
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
MASK
,
EX
);
typename
ScalarType
<
nv_bfloat16
>::
FragB
frag_b
;
static
constexpr
uint32_t
MUL
=
0x3F803F80
;
...
...
csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
View file @
823ab796
...
...
@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const
int
SUB
=
0x64086408
;
...
...
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
View file @
823ab796
...
...
@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
static
constexpr
uint32_t
HI
=
0x00f000f0
;
static
constexpr
uint32_t
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t
t0
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
uint32_t
t1
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
uint32_t
t0
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
uint32_t
t1
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
static
constexpr
uint32_t
SUB
=
0x64086408
;
...
...
csrc/quantization/marlin/sparse/common/mma.h
View file @
823ab796
...
...
@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const
int
SUB
=
0x64086408
;
...
...
csrc/rocm/attention.cu
View file @
823ab796
...
...
@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const
scalar_t
*
__restrict__
tmp_out
,
// [num_seqs, num_heads,
// max_num_partitions, head_size]
const
int
*
__restrict__
context_lens
,
// [num_seqs]
const
int
max_num_partitions
){
UNREACHABLE_CODE
}
const
int
max_num_partitions
)
{
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
...
...
setup.py
View file @
823ab796
...
...
@@ -417,7 +417,7 @@ def get_rocm_version():
if
(
get_rocm_core_version
(
ctypes
.
byref
(
major
),
ctypes
.
byref
(
minor
),
ctypes
.
byref
(
patch
))
==
0
):
return
"%d.%d.%d"
%
(
major
.
value
,
minor
.
value
,
patch
.
value
)
return
f
"
{
major
.
value
}
.
{
minor
.
value
}
.
{
patch
.
value
}
"
return
None
except
Exception
:
return
None
...
...
tests/kernels/test_block_fp8.py
View file @
823ab796
...
...
@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
A
[:,
i
*
block_k
:
min
((
i
+
1
)
*
block_k
,
K
)]
for
i
in
range
(
k_tiles
)
]
B_tiles
=
[[
B
[
j
*
block_n
:
min
((
j
+
1
)
*
block_n
,
N
),
i
*
block_k
:
min
((
i
+
1
)
*
block_k
,
K
),
]
for
i
in
range
(
k_tiles
)
B
[
j
*
block_n
:
min
((
j
+
1
)
*
block_n
,
N
),
i
*
block_k
:
min
((
i
+
1
)
*
block_k
,
K
),
]
for
i
in
range
(
k_tiles
)
]
for
j
in
range
(
n_tiles
)]
C_tiles
=
[
C
[:,
j
*
block_n
:
min
((
j
+
1
)
*
block_n
,
N
)]
for
j
in
range
(
n_tiles
)
...
...
@@ -157,9 +159,9 @@ def setup_cuda():
torch
.
set_default_device
(
"cuda"
)
@
pytest
.
mark
.
parametrize
(
"num_tokens,d,dtype,group_size,seed"
,
itertools
.
product
(
NUM_TOKENS
,
D
,
DTYPES
,
GROUP_SIZE
,
SEEDS
))
@
pytest
.
mark
.
parametrize
(
"num_tokens,d,dtype,group_size,seed"
,
itertools
.
product
(
NUM_TOKENS
,
D
,
DTYPES
,
GROUP_SIZE
,
SEEDS
))
@
torch
.
inference_mode
()
def
test_per_token_group_quant_fp8
(
num_tokens
,
d
,
dtype
,
group_size
,
seed
):
torch
.
manual_seed
(
seed
)
...
...
@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
assert
torch
.
allclose
(
scale
,
ref_scale
)
@
pytest
.
mark
.
parametrize
(
"M,N,K,block_size,out_dtype,seed"
,
itertools
.
product
(
M
,
N
,
K
,
BLOCK_SIZE
,
OUT_DTYPES
,
SEEDS
))
@
pytest
.
mark
.
parametrize
(
"
M,N,K,
block_size,out_dtype,seed"
,
itertools
.
product
(
M
,
N
,
K
,
BLOCK_SIZE
,
OUT_DTYPES
,
SEEDS
))
@
torch
.
inference_mode
()
def
test_w8a8_block_fp8_matmul
(
M
,
N
,
K
,
block_size
,
out_dtype
,
seed
):
torch
.
manual_seed
(
seed
)
...
...
@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert
rel_diff
<
0.001
@
pytest
.
mark
.
parametrize
(
"M,N,K,E,topk,block_size,dtype,seed"
,
itertools
.
product
(
M_moe
,
N_moe
,
K_moe
,
E
,
TOP_KS
,
BLOCK_SIZE
,
DTYPES
,
SEEDS
))
@
pytest
.
mark
.
parametrize
(
"M,N,K,E,topk,block_size,dtype,seed"
,
itertools
.
product
(
M_moe
,
N_moe
,
K_moe
,
E
,
TOP_KS
,
BLOCK_SIZE
,
DTYPES
,
SEEDS
))
@
torch
.
inference_mode
()
def
test_w8a8_block_fp8_fused_moe
(
M
,
N
,
K
,
E
,
topk
,
block_size
,
dtype
,
seed
):
torch
.
manual_seed
(
seed
)
...
...
tests/kv_transfer/test_lookup_buffer.py
View file @
823ab796
...
...
@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
assert
buffer
.
buffer_size
==
0
assert
len
(
buffer
.
buffer
)
==
0
print
(
"My rank:
%d
, device:
%s"
%
(
my_rank
,
device
)
)
print
(
f
"My rank:
{
my_rank
}
, device:
{
device
}
"
)
# insert
tokens
=
torch
.
tensor
([
1
,
2
,
3
]).
to
(
device
)
...
...
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert
buffer
.
buffer_size
==
0
assert
len
(
buffer
.
buffer
)
==
0
print
(
"My rank:
%d
, Test run passed!"
%
(
my_rank
)
)
print
(
f
"My rank:
{
my_rank
}
, Test run passed!"
)
def
stress_test
(
my_rank
,
buf
,
device
):
...
...
@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
assert
torch
.
allclose
(
k
,
k_
)
assert
torch
.
allclose
(
v
,
v_
)
assert
torch
.
allclose
(
h
,
h_
)
print
(
'
Rank
%d done'
%
my_rank
)
print
(
f
"
Rank
{
my_rank
}
done"
)
torch
.
distributed
.
barrier
()
if
my_rank
==
0
:
...
...
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else
:
torch
.
distributed
.
send
(
torch
.
tensor
([
n
]),
0
)
print
(
"My rank:
%d
, Passed stress test!"
%
(
my_rank
)
)
print
(
f
"My rank:
{
my_rank
}
, Passed stress test!"
)
if
__name__
==
"__main__"
:
...
...
@@ -122,7 +122,7 @@ if __name__ == "__main__":
rank
=
my_rank
,
)
print
(
"initialized! My rank is
%d"
%
my_rank
)
print
(
f
"initialized! My rank is
{
my_rank
}
"
)
config
=
KVTransferConfig
(
kv_connector
=
'PyNcclConnector'
,
...
...
tests/lora/test_qwen2vl.py
View file @
823ab796
...
...
@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return
generated_texts
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"Qwen2-VL dependency xformers incompatible with ROCm"
)
@
pytest
.
mark
.
xfail
(
current_platform
.
is_rocm
(),
reason
=
"Qwen2-VL dependency xformers incompatible with ROCm"
)
def
test_qwen2vl_lora
(
qwen2vl_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
823ab796
...
...
@@ -521,12 +521,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
# - image embeddings
# - video
# - custom inputs
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
IMAGE
,
fork_new_process_for_each_test
=
False
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
IMAGE
,
fork_new_process_for_each_test
=
False
,
))
def
test_single_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
...
...
@@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
fork_new_process_for_each_test
=
False
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
fork_new_process_for_each_test
=
False
,
))
def
test_multi_image_models
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
...
...
@@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
EMBEDDING
,
fork_new_process_for_each_test
=
False
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
EMBEDDING
,
fork_new_process_for_each_test
=
False
,
))
def
test_image_embedding_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
...
...
@@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
VIDEO
,
fork_new_process_for_each_test
=
False
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
VIDEO
,
fork_new_process_for_each_test
=
False
,
))
def
test_video_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
video_assets
:
_VideoAssets
):
...
...
@@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
fork_new_process_for_each_test
=
False
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
fork_new_process_for_each_test
=
False
,
))
def
test_custom_inputs_models
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
...
...
@@ -627,12 +632,13 @@ def test_custom_inputs_models(
#### Tests filtering for things running each test as a new process
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
IMAGE
,
fork_new_process_for_each_test
=
True
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
IMAGE
,
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
def
test_single_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
...
...
@@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
fork_new_process_for_each_test
=
True
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
MULTI_IMAGE
,
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
def
test_multi_image_models_heavy
(
tmp_path
:
PosixPath
,
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
...
...
@@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
EMBEDDING
,
fork_new_process_for_each_test
=
True
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
EMBEDDING
,
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
def
test_image_embedding_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
...
...
@@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
VIDEO
,
fork_new_process_for_each_test
=
True
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
VIDEO
,
fork_new_process_for_each_test
=
True
,
))
def
test_video_models_heavy
(
model_type
:
str
,
test_case
:
ExpandableVLMTestArgs
,
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
...
...
@@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
)
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
fork_new_process_for_each_test
=
True
,
))
@
pytest
.
mark
.
parametrize
(
"model_type,test_case"
,
get_parametrized_options
(
VLM_TEST_SETTINGS
,
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
fork_new_process_for_each_test
=
True
,
))
@
fork_new_process_for_each_test
def
test_custom_inputs_models_heavy
(
model_type
:
str
,
...
...
tests/models/decoder_only/vision_language/test_pixtral.py
View file @
823ab796
...
...
@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs(
outputs
:
OutputsLogprobs
,
filename
:
"StrPath"
,
)
->
None
:
json_data
=
[(
tokens
,
text
,
[{
k
:
asdict
(
v
)
for
k
,
v
in
token_logprobs
.
items
()
}
for
token_logprobs
in
(
logprobs
or
[])])
json_data
=
[(
tokens
,
text
,
[{
k
:
asdict
(
v
)
for
k
,
v
in
token_logprobs
.
items
()
}
for
token_logprobs
in
(
logprobs
or
[])])
for
tokens
,
text
,
logprobs
in
outputs
]
with
open
(
filename
,
"w"
)
as
f
:
...
...
@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with
open
(
filename
,
"rb"
)
as
f
:
json_data
=
json
.
load
(
f
)
return
[(
tokens
,
text
,
[{
int
(
k
):
Logprob
(
**
v
)
for
k
,
v
in
token_logprobs
.
items
()}
for
token_logprobs
in
logprobs
])
for
tokens
,
text
,
logprobs
in
json_data
]
return
[(
tokens
,
text
,
[{
int
(
k
):
Logprob
(
**
v
)
for
k
,
v
in
token_logprobs
.
items
()
}
for
token_logprobs
in
logprobs
])
for
tokens
,
text
,
logprobs
in
json_data
]
@
large_gpu_test
(
min_gb
=
80
)
...
...
tests/quantization/test_compressed_tensors.py
View file @
823ab796
...
...
@@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
@
pytest
.
mark
.
skip
(
reason
=
"2of4 sparse w16a16 CUTLASS produces bad output."
)
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"2of4 Sparse is not yet supported on this GPU type."
)
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"2of4 Sparse is not yet supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"args_2of4"
,
[(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor"
)])
...
...
tests/samplers/test_rejection_sampler.py
View file @
823ab796
...
...
@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor(
"""
batch_size
=
last_accepted_indices
.
shape
[
0
]
accepted
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
<=
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
accepted
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
<=
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
batch_size
,
k
))
# Sprinkle accepted values after the contiguous initial accepted values.
# This replicates the behavior of rejection sampling, which may "accept"
# a token that cannot be accepted because of causality.
sprinkle_candidates
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
>
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
batch_size
,
k
)
+
1
)
sprinkle_candidates
=
(
torch
.
arange
(
k
).
expand
(
batch_size
,
k
)
>
last_accepted_indices
.
unsqueeze
(
-
1
).
broadcast_to
(
batch_size
,
k
)
+
1
)
sprinkle
=
torch
.
rand
(
batch_size
,
k
)
>
0.5
accepted
[
sprinkle_candidates
]
=
sprinkle
[
sprinkle_candidates
]
return
accepted
...
...
@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution(
distance_wrt_reference
)
expected_improvement_multiplier
=
20
assert
(
relative_change_in_distance_wrt_target
>
relative_change_in_distance_wrt_reference
*
assert
(
relative_change_in_distance_wrt_target
>
relative_change_in_distance_wrt_reference
*
expected_improvement_multiplier
)
...
...
tools/report_build_time_ninja.py
View file @
823ab796
...
...
@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types):
print
(
' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
'parallelism)'
.
format
(
length
,
total_cpu_time
,
total_cpu_time
*
1.0
/
length
))
print
(
' %d build steps completed, average of %1.2f/s'
%
(
len
(
entries
),
len
(
entries
)
/
(
length
)))
print
(
' {} build steps completed, average of {:1.2f}/s'
.
format
(
len
(
entries
),
len
(
entries
)
/
(
length
)))
def
main
():
...
...
vllm/_custom_ops.py
View file @
823ab796
...
...
@@ -820,8 +820,8 @@ def scaled_int8_quant(
if
scale
is
not
None
:
# static-per-tensor quantization.
assert
symmetric
==
(
azp
is
None
),
"azp must only be provided for asymmetric quantization."
azp
is
None
),
"azp must only be provided for asymmetric quantization."
torch
.
ops
.
_C
.
static_scaled_int8_quant
(
output
,
input
,
scale
,
azp
)
return
output
,
scale
,
azp
...
...
vllm/attention/ops/prefix_prefill.py
View file @
823ab796
...
...
@@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0":
float
(
"-inf"
))
if
SLIDING_WINDOW
>
0
:
qk
=
tl
.
where
(
offs_m
[:,
None
]
-
(
start_n
+
offs_n
[
None
,
:])
<
SLIDING_WINDOW
,
qk
,
-
10000
)
offs_m
[:,
None
]
-
(
start_n
+
offs_n
[
None
,
:])
<
SLIDING_WINDOW
,
qk
,
-
10000
)
# -- compute m_ij, p, l_ij
m_ij
=
tl
.
max
(
qk
,
1
)
...
...
@@ -324,10 +324,10 @@ if triton.__version__ >= "2.1.0":
(
cur_batch_in_all_start_index
+
offs_m
[:,
None
])
*
stride_qbs
+
cur_head
*
stride_qh
+
offs_d
[
None
,
:]
*
stride_qd
)
q
=
tl
.
load
(
Q
+
off_q
,
mask
=
offs_m
[:,
None
]
<
cur_batch_seq_len
-
cur_batch_ctx_len
,
other
=
0.0
)
q
=
tl
.
load
(
Q
+
off_q
,
mask
=
offs_m
[:,
None
]
<
cur_batch_seq_len
-
cur_batch_ctx_len
,
other
=
0.0
)
# # initialize pointer to m and l
m_i
=
tl
.
zeros
([
BLOCK_M
],
dtype
=
tl
.
float32
)
-
float
(
"inf"
)
...
...
@@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0":
# -- compute qk ----
k
=
tl
.
load
(
k_ptrs
+
(
cur_batch_in_all_start_index
+
start_n
)
*
stride_kbs
,
mask
=
(
start_n
+
offs_n
[
None
,
:])
<
cur_batch_seq_len
-
cur_batch_ctx_len
,
mask
=
(
start_n
+
offs_n
[
None
,
:])
<
cur_batch_seq_len
-
cur_batch_ctx_len
,
other
=
0.0
)
qk
=
tl
.
zeros
([
BLOCK_M
,
BLOCK_N
],
dtype
=
tl
.
float32
)
...
...
@@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0":
# update acc
v
=
tl
.
load
(
v_ptrs
+
(
cur_batch_in_all_start_index
+
start_n
)
*
stride_vbs
,
mask
=
(
start_n
+
offs_n
[:,
None
])
<
cur_batch_seq_len
-
cur_batch_ctx_len
,
mask
=
(
start_n
+
offs_n
[:,
None
])
<
cur_batch_seq_len
-
cur_batch_ctx_len
,
other
=
0.0
)
p
=
p
.
to
(
v
.
dtype
)
...
...
@@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0":
k
=
tl
.
load
(
k_ptrs
+
(
cur_batch_in_all_start_index
+
start_n
)
*
stride_kbs
,
mask
=
dim_mask
[:,
None
]
&
((
start_n
+
offs_n
[
None
,
:])
<
cur_batch_seq_len
-
cur_batch_ctx_len
),
((
start_n
+
offs_n
[
None
,
:])
<
cur_batch_seq_len
-
cur_batch_ctx_len
),
other
=
0.0
)
qk
=
tl
.
zeros
([
BLOCK_M
,
BLOCK_N
],
dtype
=
tl
.
float32
)
...
...
@@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0":
v
=
tl
.
load
(
v_ptrs
+
(
cur_batch_in_all_start_index
+
start_n
)
*
stride_vbs
,
mask
=
dim_mask
[
None
,
:]
&
((
start_n
+
offs_n
[:,
None
])
<
cur_batch_seq_len
-
cur_batch_ctx_len
),
((
start_n
+
offs_n
[:,
None
])
<
cur_batch_seq_len
-
cur_batch_ctx_len
),
other
=
0.0
)
p
=
p
.
to
(
v
.
dtype
)
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment