Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec5e299c
Commit
ec5e299c
authored
Feb 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.3' into v0.7.3-dev
parents
47bd229c
ed6e9075
Changes
521
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1161 additions
and
58 deletions
+1161
-58
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+21
-0
tests/entrypoints/openai/correctness/__init__.py
tests/entrypoints/openai/correctness/__init__.py
+0
-0
tests/entrypoints/openai/correctness/test_lmeval.py
tests/entrypoints/openai/correctness/test_lmeval.py
+12
-5
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+166
-0
tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
...nai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+92
-20
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+4
-7
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+16
-0
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+43
-3
tests/entrypoints/openai/test_rerank.py
tests/entrypoints/openai/test_rerank.py
+1
-1
tests/entrypoints/openai/test_serving_models.py
tests/entrypoints/openai/test_serving_models.py
+1
-1
tests/entrypoints/openai/test_shutdown.py
tests/entrypoints/openai/test_shutdown.py
+2
-1
tests/entrypoints/openai/test_sleep.py
tests/entrypoints/openai/test_sleep.py
+32
-0
tests/entrypoints/openai/test_transcription_validation.py
tests/entrypoints/openai/test_transcription_validation.py
+122
-0
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+2
-2
tests/entrypoints/openai/test_vision_embedding.py
tests/entrypoints/openai/test_vision_embedding.py
+2
-2
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+1
-1
tests/kernels/test_cutlass_2of4_sparse.py
tests/kernels/test_cutlass_2of4_sparse.py
+66
-15
tests/kernels/test_mamba_mixer2.py
tests/kernels/test_mamba_mixer2.py
+125
-0
tests/kernels/test_mamba_ssm_ssd.py
tests/kernels/test_mamba_ssm_ssd.py
+304
-0
tests/kernels/test_nvfp4_quant.py
tests/kernels/test_nvfp4_quant.py
+149
-0
No files found.
Too many changes to show.
To preserve performance only
521 of 521+
files are displayed.
Plain diff
Email patch
tests/entrypoints/offline_mode/test_offline_mode.py
View file @
ec5e299c
...
@@ -5,6 +5,7 @@ import sys
...
@@ -5,6 +5,7 @@ import sys
import
os
import
os
import
pytest
import
pytest
import
urllib3
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
...
@@ -31,6 +32,15 @@ MODEL_CONFIGS = [
...
@@ -31,6 +32,15 @@ MODEL_CONFIGS = [
"tensor_parallel_size"
:
1
,
"tensor_parallel_size"
:
1
,
"tokenizer_mode"
:
"mistral"
,
"tokenizer_mode"
:
"mistral"
,
},
},
{
"model"
:
"sentence-transformers/all-MiniLM-L12-v2"
,
"enforce_eager"
:
True
,
"gpu_memory_utilization"
:
0.20
,
"max_model_len"
:
64
,
"max_num_batched_tokens"
:
64
,
"max_num_seqs"
:
64
,
"tensor_parallel_size"
:
1
,
},
]
]
...
@@ -50,6 +60,16 @@ def test_offline_mode(monkeypatch):
...
@@ -50,6 +60,16 @@ def test_offline_mode(monkeypatch):
# Set HF to offline mode and ensure we can still construct an LLM
# Set HF to offline mode and ensure we can still construct an LLM
try
:
try
:
monkeypatch
.
setenv
(
"HF_HUB_OFFLINE"
,
"1"
)
monkeypatch
.
setenv
(
"HF_HUB_OFFLINE"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_NO_USAGE_STATS"
,
"1"
)
def
disable_connect
(
*
args
,
**
kwargs
):
raise
RuntimeError
(
"No http calls allowed"
)
monkeypatch
.
setattr
(
urllib3
.
connection
.
HTTPConnection
,
"connect"
,
disable_connect
)
monkeypatch
.
setattr
(
urllib3
.
connection
.
HTTPSConnection
,
"connect"
,
disable_connect
)
# Need to re-import huggingface_hub and friends to setup offline mode
# Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules
()
_re_import_modules
()
# Cached model files should be used in offline mode
# Cached model files should be used in offline mode
...
@@ -59,6 +79,7 @@ def test_offline_mode(monkeypatch):
...
@@ -59,6 +79,7 @@ def test_offline_mode(monkeypatch):
# Reset the environment after the test
# Reset the environment after the test
# NB: Assuming tests are run in online mode
# NB: Assuming tests are run in online mode
monkeypatch
.
delenv
(
"HF_HUB_OFFLINE"
)
monkeypatch
.
delenv
(
"HF_HUB_OFFLINE"
)
monkeypatch
.
delenv
(
"VLLM_NO_USAGE_STATS"
)
_re_import_modules
()
_re_import_modules
()
pass
pass
...
...
tests/
runai_model_streamer
/__init__.py
→
tests/
entrypoints/openai/correctness
/__init__.py
View file @
ec5e299c
File moved
tests/entrypoints/openai/
test_accuracy
.py
→
tests/entrypoints/openai/
correctness/test_lmeval
.py
View file @
ec5e299c
...
@@ -14,7 +14,7 @@ import os
...
@@ -14,7 +14,7 @@ import os
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...
.
utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
NUM_CONCURRENT
=
500
NUM_CONCURRENT
=
500
...
@@ -22,7 +22,7 @@ TASK = "gsm8k"
...
@@ -22,7 +22,7 @@ TASK = "gsm8k"
FILTER
=
"exact_match,strict-match"
FILTER
=
"exact_match,strict-match"
RTOL
=
0.03
RTOL
=
0.03
EXPECTED_VALUE
=
0.58
EXPECTED_VALUE
=
0.58
DEFAULT_ARGS
=
[
"--max-model-len"
,
"
2048
"
,
"--disable-log-requests"
]
DEFAULT_ARGS
=
[
"--max-model-len"
,
"
4096
"
,
"--disable-log-requests"
]
MORE_ARGS_LIST
=
[
MORE_ARGS_LIST
=
[
[],
# Default
[],
# Default
[
"--enable-chunked-prefill"
],
# Chunked
[
"--enable-chunked-prefill"
],
# Chunked
...
@@ -68,14 +68,21 @@ def run_test(more_args):
...
@@ -68,14 +68,21 @@ def run_test(more_args):
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
reason
=
"V1 currently only supported on CUDA"
)
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 currently only supported on CUDA and TPU"
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
"""Run with the V1 Engine."""
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
run_test
([])
more_args
=
[]
# Limit compilation time for V1
if
current_platform
.
is_tpu
():
more_args
=
[
"--max-num-seqs"
,
"64"
]
run_test
(
more_args
)
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
...
...
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
"""
Evaluate Transcription API correctness by computing Word Error Rate (WER)
on a given ASR dataset. When provided, it will also compare the WER against
a baseline.
This simulates real-world usage of the API and makes sure that the frontend and
AsyncLLMEngine are working correctly.
"""
import
asyncio
import
io
import
time
from
statistics
import
mean
,
median
from
typing
import
List
import
librosa
import
pytest
import
soundfile
import
torch
from
datasets
import
load_dataset
from
evaluate
import
load
from
transformers
import
AutoTokenizer
from
....utils
import
RemoteOpenAIServer
def to_bytes(y, sr):
    """Encode the samples ``y`` at sample rate ``sr`` as WAV in memory.

    Returns the ``io.BytesIO`` holding the encoded audio, rewound to
    offset 0 so it can be read like a freshly opened file.
    """
    wav_stream = io.BytesIO()
    soundfile.write(wav_stream, y, sr, format="WAV")
    # The write leaves the cursor at the end of the data; rewind it.
    wav_stream.seek(0)
    return wav_stream
async def transcribe_audio(client, tokenizer, y, sr):
    """Transcribe one audio clip and time the request.

    Args:
        client: async client exposing ``audio.transcriptions.create``.
        tokenizer: tokenizer whose ``name_or_path`` is sent as the model
            name and which is used to count the output tokens.
        y: audio samples.
        sr: sampling rate of ``y``.

    Returns:
        A ``(latency, num_output_tokens, text)`` tuple, where ``latency``
        is the ``time.perf_counter`` difference around the request.
    """
    # Send loaded audio directly instead of loading from disk; the WAV
    # encoding happens before the timer starts, so it isn't accounted for.
    with to_bytes(y, sr) as wav_file:
        t0 = time.perf_counter()
        response = await client.audio.transcriptions.create(
            file=wav_file,
            model=tokenizer.name_or_path,
            language="en",
            temperature=0.0,
        )
        t1 = time.perf_counter()
        # NOTE there's no streaming in transcriptions, can't measure ttft
    elapsed = t1 - t0
    token_ids = tokenizer(response.text, add_special_tokens=False).input_ids
    return elapsed, len(token_ids), response.text
# Tokenizers already loaded by this module, keyed by model name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the tokenizer for ``model_name``, loading it on first use only."""
    try:
        return _TOKENIZER_CACHE[model_name]
    except KeyError:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer
        return tokenizer


async def bound_transcribe(model_name, sem, client, audio, reference):
    """Transcribe one clip under a concurrency limit and normalize the texts.

    Args:
        model_name: model identifier used to load the tokenizer.
        sem: semaphore bounding the number of in-flight requests.
        client: async client forwarded to ``transcribe_audio``.
        audio: ``(samples, sampling_rate)`` pair, unpacked into
            ``transcribe_audio``.
        reference: reference transcript for this clip.

    Returns:
        ``(latency, num_output_tokens, normalized_output,
        normalized_reference)``.
    """
    # This coroutine runs once per sample; reuse the tokenizer instead of
    # calling `from_pretrained` again for every request.
    tokenizer = _get_tokenizer(model_name)
    # Use semaphore to limit concurrent requests.
    async with sem:
        result = await transcribe_audio(client, tokenizer, *audio)
    # Normalize *english* output/reference for evaluation.
    out = tokenizer.normalize(result[2])
    ref = tokenizer.normalize(reference)
    return result[:2] + (out, ref)
async def process_dataset(model, client, data, concurrent_request):
    """Transcribe every sample in ``data`` with bounded concurrency.

    One warm-up request is issued for the first sample and its result is
    discarded. Then one task per sample is created and all of them are
    awaited together.

    Returns:
        The list produced by ``asyncio.gather`` over the per-sample
        ``bound_transcribe`` tasks, in dataset order.
    """
    limiter = asyncio.Semaphore(concurrent_request)

    # Warmup call as the first `librosa.load` server-side is quite slow.
    warmup_clip = (data[0]["audio"]["array"],
                   data[0]["audio"]["sampling_rate"])
    await bound_transcribe(model, limiter, client, warmup_clip, "")

    pending = [
        asyncio.create_task(
            bound_transcribe(
                model,
                limiter,
                client,
                (sample["audio"]["array"], sample["audio"]["sampling_rate"]),
                sample["text"],
            ))
        for sample in data
    ]
    return await asyncio.gather(*pending)
def print_performance_metrics(results, total_time):
    """Print latency and throughput statistics for a finished run.

    Args:
        results: sequence of per-request records; index 0 is the request
            latency and index 1 the number of output tokens.
        total_time: wall-clock duration of the whole run, used as the
            denominator for both throughput figures.
    """
    latencies = []
    total_tokens = 0
    for record in results:
        latencies.append(record[0])
        total_tokens += record[1]
    n_requests = len(latencies)

    print(f"Total Requests: {len(results)}")
    print(f"Successful Requests: {n_requests}")
    print(f"Average Latency: {mean(latencies):.4f} seconds")
    print(f"Median Latency: {median(latencies):.4f} seconds")
    # Index of the 95th-percentile entry in the ascending latency list.
    p95_index = int(n_requests * 0.95) - 1
    p95 = sorted(latencies)[p95_index]
    print(f"95th Percentile Latency: {p95:.4f} seconds")

    # Throughput
    print(f"Estimated req_Throughput: {n_requests / total_time:.2f} requests/s")
    print(f"Estimated Throughput: {total_tokens / total_time:.2f} tok/s")
def add_duration(sample):
    """Store the clip length in milliseconds under ``sample['duration_ms']``.

    The length comes from ``librosa.get_duration`` on the sample's audio
    array and sampling rate. The sample is updated in place and returned.
    """
    clip = sample['audio']
    seconds = librosa.get_duration(y=clip["array"], sr=clip["sampling_rate"])
    sample['duration_ms'] = seconds * 1000
    return sample
def load_hf_dataset(dataset_repo: str, split='validation', **hf_kwargs):
    """Load a dataset split and keep only clips shorter than 30000 ms.

    Args:
        dataset_repo: repository name passed to ``load_dataset``.
        split: split to load.
        **hf_kwargs: forwarded unchanged to ``load_dataset``.

    Returns:
        The filtered dataset. A ``duration_ms`` column is computed with
        ``add_duration`` when the first row does not already carry one.
    """
    ## Load and filter the dataset
    loaded = load_dataset(dataset_repo, split=split, **hf_kwargs)

    has_duration = 'duration_ms' in loaded[0]
    if not has_duration:
        # compute duration to filter
        loaded = loaded.map(add_duration)

    # Whisper max supported duration
    return loaded.filter(lambda example: example['duration_ms'] < 30000)
def run_evaluation(model: str,
                   client,
                   dataset,
                   max_concurrent_reqs: int,
                   n_examples: int = -1,
                   print_metrics: bool = True):
    """Transcribe a dataset and return its Word Error Rate as a percentage.

    Args:
        model: model name forwarded to ``process_dataset``.
        client: async client used for the transcription requests.
        dataset: dataset to evaluate.
        max_concurrent_reqs: concurrency limit for the requests.
        n_examples: when positive, only the first ``n_examples`` rows
            are evaluated.
        print_metrics: when true, latency/throughput statistics are
            printed as well.

    Returns:
        ``100 *`` the value computed by the ``"wer"`` metric.
    """
    if n_examples > 0:
        dataset = dataset.select(range(n_examples))

    t0 = time.perf_counter()
    results = asyncio.run(
        process_dataset(model, client, dataset, max_concurrent_reqs))
    t1 = time.perf_counter()
    total_time = t1 - t0
    print(f"Total Test Time: {total_time:.4f} seconds")
    if print_metrics:
        print_performance_metrics(results, total_time)

    # Compute WER
    predictions, references = [], []
    for record in results:
        predictions.append(record[2])
        references.append(record[3])
    metric = load("wer")
    wer_score = 100 * metric.compute(references=references,
                                     predictions=predictions)
    print("WER:", wer_score)
    return wer_score
# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"])
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(model_name,
                         dataset_repo,
                         expected_wer,
                         n_examples=-1,
                         max_concurrent_request=None):
    """Serve ``model_name``, transcribe the dataset and compare the WER.

    When ``max_concurrent_request`` is falsy, the concurrency limit is
    ``n_examples`` if that is positive, otherwise the dataset length.
    The measured WER is compared with ``expected_wer`` only when the
    latter is truthy.
    """
    with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server:
        dataset = load_hf_dataset(dataset_repo)

        if not max_concurrent_request:
            # No max concurrency
            if n_examples > 0:
                max_concurrent_request = n_examples
            else:
                max_concurrent_request = len(dataset)

        client = remote_server.get_async_client()
        wer = run_evaluation(model_name, client, dataset,
                             max_concurrent_request, n_examples)
        if expected_wer:
            torch.testing.assert_close(wer,
                                       expected_wer,
                                       atol=1e-1,
                                       rtol=1e-2)
tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
View file @
ec5e299c
...
@@ -15,32 +15,62 @@ start_token = "<think>"
...
@@ -15,32 +15,62 @@ start_token = "<think>"
end_token
=
"</think>"
end_token
=
"</think>"
SIMPLE_REASONING
=
{
SIMPLE_REASONING
=
{
"output"
:
"
<think>
This is a reasoning section</think>This is the rest"
,
"output"
:
"This is a reasoning section</think>This is the rest"
,
"reasoning_content"
:
"This is a reasoning section"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
}
}
COMPLETE_REASONING
=
{
COMPLETE_REASONING
=
{
"output"
:
"
<think>
This is a reasoning section</think>"
,
"output"
:
"This is a reasoning section</think>"
,
"reasoning_content"
:
"This is a reasoning section"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
"content"
:
None
,
}
}
NO_REASONING
=
{
NO_CONTENT
=
{
"output"
:
"This is content"
,
"reasoning_content"
:
"This is content"
,
"content"
:
None
,
}
NO_REASONING_STREAMING
=
{
"output"
:
"This is a reasoning section"
,
"output"
:
"This is a reasoning section"
,
"reasoning_content"
:
None
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is a reasoning section"
,
"content"
:
None
,
}
}
MULTIPLE_LINES
=
{
MULTIPLE_LINES
=
{
"output"
:
"
<think>
This
\n
That</think>This is the rest
\n
That"
,
"output"
:
"This
\n
That</think>This is the rest
\n
That"
,
"reasoning_content"
:
"This
\n
That"
,
"reasoning_content"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
}
}
SHORTEST_REASONING_NO_STREAMING
=
{
SHORTEST_REASONING_NO_STREAMING
=
{
"output"
:
"<
think><
/think>This is the rest"
,
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
""
,
"reasoning_content"
:
""
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
}
}
SHORTEST_REASONING
=
{
SHORTEST_REASONING
=
{
"output"
:
"<think></think>This is the rest"
,
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
None
,
"content"
:
"This is the rest"
,
}
REASONING_WITH_THINK
=
{
"output"
:
"<think>This is a reasoning section</think>This is the rest"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
}
COMPLETE_REASONING_WITH_THINK
=
{
"output"
:
"<think>This is a reasoning section</think>"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
}
MULTIPLE_LINES_WITH_THINK
=
{
"output"
:
"<think>This
\n
That</think>This is the rest
\n
That"
,
"reasoning_content"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
""
,
"content"
:
"This is the rest"
,
}
SHORTEST_REASONING_WITH_THINK
=
{
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
None
,
"reasoning_content"
:
None
,
"content"
:
"This is the rest"
,
"content"
:
"This is the rest"
,
}
}
...
@@ -49,37 +79,37 @@ TEST_CASES = [
...
@@ -49,37 +79,37 @@ TEST_CASES = [
pytest
.
param
(
pytest
.
param
(
False
,
False
,
SIMPLE_REASONING
,
SIMPLE_REASONING
,
id
=
"simple_
st
rea
m
ing"
,
id
=
"simple_rea
son
ing"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
SIMPLE_REASONING
,
SIMPLE_REASONING
,
id
=
"simple_streaming"
,
id
=
"simple_
reasoning_
streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
COMPLETE_REASONING
,
COMPLETE_REASONING
,
id
=
"complete_
st
rea
m
ing"
,
id
=
"complete_rea
son
ing"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
COMPLETE_REASONING
,
COMPLETE_REASONING
,
id
=
"complete_streaming"
,
id
=
"complete_
reasoning_
streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
NO_
REASONING
,
NO_
CONTENT
,
id
=
"no_
streaming
"
,
id
=
"no_
content_token
"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
NO_REASONING
,
NO_REASONING
_STREAMING
,
id
=
"no_streaming"
,
id
=
"no_
reasoning_token_
streaming"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
MULTIPLE_LINES
,
MULTIPLE_LINES
,
id
=
"multiple_lines
_streaming
"
,
id
=
"multiple_lines"
,
),
),
pytest
.
param
(
pytest
.
param
(
True
,
True
,
...
@@ -89,23 +119,65 @@ TEST_CASES = [
...
@@ -89,23 +119,65 @@ TEST_CASES = [
pytest
.
param
(
pytest
.
param
(
True
,
True
,
SHORTEST_REASONING
,
SHORTEST_REASONING
,
id
=
"shortest
_streaming
"
,
id
=
"shortest"
,
),
),
pytest
.
param
(
pytest
.
param
(
False
,
False
,
SHORTEST_REASONING_NO_STREAMING
,
SHORTEST_REASONING_NO_STREAMING
,
id
=
"shortest_streaming"
,
id
=
"shortest_streaming"
,
),
),
pytest
.
param
(
False
,
REASONING_WITH_THINK
,
id
=
"reasoning_with_think"
,
),
pytest
.
param
(
True
,
REASONING_WITH_THINK
,
id
=
"reasoning_with_think_streaming"
,
),
pytest
.
param
(
False
,
COMPLETE_REASONING_WITH_THINK
,
id
=
"complete_reasoning_with_think"
,
),
pytest
.
param
(
True
,
COMPLETE_REASONING_WITH_THINK
,
id
=
"complete_reasoning_with_think_streaming"
,
),
pytest
.
param
(
False
,
MULTIPLE_LINES_WITH_THINK
,
id
=
"multiple_lines_with_think"
,
),
pytest
.
param
(
True
,
MULTIPLE_LINES_WITH_THINK
,
id
=
"multiple_lines_with_think_streaming"
,
),
pytest
.
param
(
False
,
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
,
id
=
"shortest_with_think"
,
),
pytest
.
param
(
True
,
SHORTEST_REASONING_WITH_THINK
,
id
=
"shortest_with_think_streaming"
,
),
]
]
# Global tokenizer initialization to avoid repeated loading
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"facebook/opt-125m"
)
tokenizer
.
add_tokens
([
start_token
,
end_token
])
@
pytest
.
mark
.
parametrize
(
"streaming, param_dict"
,
TEST_CASES
)
@
pytest
.
mark
.
parametrize
(
"streaming, param_dict"
,
TEST_CASES
)
def
test_reasoning
(
def
test_reasoning
(
streaming
:
bool
,
streaming
:
bool
,
param_dict
:
dict
,
param_dict
:
dict
,
):
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"facebook/opt-125m"
)
tokenizer
.
add_tokens
([
start_token
,
end_token
])
output
=
tokenizer
.
tokenize
(
param_dict
[
"output"
])
output
=
tokenizer
.
tokenize
(
param_dict
[
"output"
])
# decode everything to tokens
# decode everything to tokens
output_tokens
:
List
[
str
]
=
[
output_tokens
:
List
[
str
]
=
[
...
...
tests/entrypoints/openai/test_audio.py
View file @
ec5e299c
...
@@ -12,10 +12,7 @@ from vllm.multimodal.utils import encode_audio_base64, fetch_audio
...
@@ -12,10 +12,7 @@ from vllm.multimodal.utils import encode_audio_base64, fetch_audio
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_3"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
)
# TEST_AUDIO_URLS = [
# AudioAsset("winning_call").url,
# ]
TEST_AUDIO_URLS
=
[
TEST_AUDIO_URLS
=
[
"http://localhost:8000/winning_call.ogg"
"http://localhost:8000/winning_call.ogg"
]
]
...
@@ -86,7 +83,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
...
@@ -86,7 +83,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
20
2
,
total_tokens
=
21
2
)
completion_tokens
=
10
,
prompt_tokens
=
20
1
,
total_tokens
=
21
1
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -143,7 +140,7 @@ async def test_single_chat_session_audio_base64encoded(
...
@@ -143,7 +140,7 @@ async def test_single_chat_session_audio_base64encoded(
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
20
2
,
total_tokens
=
21
2
)
completion_tokens
=
10
,
prompt_tokens
=
20
1
,
total_tokens
=
21
1
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -199,7 +196,7 @@ async def test_single_chat_session_input_audio(
...
@@ -199,7 +196,7 @@ async def test_single_chat_session_input_audio(
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
20
2
,
total_tokens
=
21
2
)
completion_tokens
=
10
,
prompt_tokens
=
20
1
,
total_tokens
=
21
1
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
...
tests/entrypoints/openai/test_basic.py
View file @
ec5e299c
...
@@ -157,3 +157,19 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
...
@@ -157,3 +157,19 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
max_tokens
=
10
)
max_tokens
=
10
)
assert
len
(
response
.
choices
)
==
1
assert
len
(
response
.
choices
)
==
1
@pytest.mark.asyncio
async def test_request_wrong_content_type(server: RemoteOpenAIServer):
    """A chat completion sent with a form-urlencoded Content-Type must fail.

    The request is expected to raise ``openai.APIStatusError``.
    """
    wrong_headers = {"Content-Type": "application/x-www-form-urlencoded"}
    chat_input = [{"role": "user", "content": "Write a long story"}]
    client = server.get_async_client()

    with pytest.raises(openai.APIStatusError):
        await client.chat.completions.create(
            messages=chat_input,
            model=MODEL_NAME,
            max_tokens=10000,
            extra_headers=wrong_headers,
        )
tests/entrypoints/openai/test_metrics.py
View file @
ec5e299c
...
@@ -86,6 +86,10 @@ EXPECTED_VALUES = {
...
@@ -86,6 +86,10 @@ EXPECTED_VALUES = {
"vllm:time_per_output_token_seconds"
:
"vllm:time_per_output_token_seconds"
:
[(
"_count"
,
_NUM_REQUESTS
*
(
_NUM_GENERATION_TOKENS_PER_REQUEST
-
1
))],
[(
"_count"
,
_NUM_REQUESTS
*
(
_NUM_GENERATION_TOKENS_PER_REQUEST
-
1
))],
"vllm:e2e_request_latency_seconds"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:e2e_request_latency_seconds"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_queue_time_seconds"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_inference_time_seconds"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_prefill_time_seconds"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_decode_time_seconds"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_prompt_tokens"
:
"vllm:request_prompt_tokens"
:
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
),
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
),
(
"_count"
,
_NUM_REQUESTS
)],
(
"_count"
,
_NUM_REQUESTS
)],
...
@@ -93,9 +97,14 @@ EXPECTED_VALUES = {
...
@@ -93,9 +97,14 @@ EXPECTED_VALUES = {
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
(
"_count"
,
_NUM_REQUESTS
)],
(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_n"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_n"
:
[(
"_count"
,
_NUM_REQUESTS
)],
"vllm:request_params_max_tokens"
:
"vllm:request_params_max_tokens"
:
[
[(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
(
"_sum"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
),
(
"_count"
,
_NUM_REQUESTS
)],
(
"_count"
,
_NUM_REQUESTS
)
],
"vllm:iteration_tokens_total"
:
[(
"_sum"
,
_NUM_REQUESTS
*
(
_NUM_PROMPT_TOKENS_PER_REQUEST
+
_NUM_GENERATION_TOKENS_PER_REQUEST
)),
(
"_count"
,
_NUM_REQUESTS
*
_NUM_GENERATION_TOKENS_PER_REQUEST
)],
"vllm:prompt_tokens"
:
[(
"_total"
,
"vllm:prompt_tokens"
:
[(
"_total"
,
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)],
_NUM_REQUESTS
*
_NUM_PROMPT_TOKENS_PER_REQUEST
)],
"vllm:generation_tokens"
:
[
"vllm:generation_tokens"
:
[
...
@@ -170,6 +179,18 @@ EXPECTED_METRICS = [
...
@@ -170,6 +179,18 @@ EXPECTED_METRICS = [
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_count"
,
"vllm:e2e_request_latency_seconds_count"
,
"vllm:request_queue_time_seconds_sum"
,
"vllm:request_queue_time_seconds_bucket"
,
"vllm:request_queue_time_seconds_count"
,
"vllm:request_inference_time_seconds_sum"
,
"vllm:request_inference_time_seconds_bucket"
,
"vllm:request_inference_time_seconds_count"
,
"vllm:request_prefill_time_seconds_sum"
,
"vllm:request_prefill_time_seconds_bucket"
,
"vllm:request_prefill_time_seconds_count"
,
"vllm:request_decode_time_seconds_sum"
,
"vllm:request_decode_time_seconds_bucket"
,
"vllm:request_decode_time_seconds_count"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_bucket"
,
"vllm:request_prompt_tokens_bucket"
,
"vllm:request_prompt_tokens_count"
,
"vllm:request_prompt_tokens_count"
,
...
@@ -182,6 +203,7 @@ EXPECTED_METRICS = [
...
@@ -182,6 +203,7 @@ EXPECTED_METRICS = [
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_count"
,
"vllm:request_params_max_tokens_count"
,
"vllm:iteration_tokens_total"
,
"vllm:num_preemptions_total"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:generation_tokens_total"
,
...
@@ -204,8 +226,11 @@ EXPECTED_METRICS_V1 = [
...
@@ -204,8 +226,11 @@ EXPECTED_METRICS_V1 = [
"vllm:num_requests_running"
,
"vllm:num_requests_running"
,
"vllm:num_requests_waiting"
,
"vllm:num_requests_waiting"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:prompt_tokens_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:iteration_tokens_total"
,
"vllm:request_success_total"
,
"vllm:request_success_total"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_bucket"
,
"vllm:request_prompt_tokens_bucket"
,
...
@@ -219,6 +244,21 @@ EXPECTED_METRICS_V1 = [
...
@@ -219,6 +244,21 @@ EXPECTED_METRICS_V1 = [
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_count"
,
"vllm:request_queue_time_seconds_sum"
,
"vllm:request_queue_time_seconds_bucket"
,
"vllm:request_queue_time_seconds_count"
,
"vllm:request_inference_time_seconds_sum"
,
"vllm:request_inference_time_seconds_bucket"
,
"vllm:request_inference_time_seconds_count"
,
"vllm:request_prefill_time_seconds_sum"
,
"vllm:request_prefill_time_seconds_bucket"
,
"vllm:request_prefill_time_seconds_count"
,
"vllm:request_decode_time_seconds_sum"
,
"vllm:request_decode_time_seconds_bucket"
,
"vllm:request_decode_time_seconds_count"
,
]
]
...
...
tests/entrypoints/openai/test_rerank.py
View file @
ec5e299c
...
@@ -86,4 +86,4 @@ def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
...
@@ -86,4 +86,4 @@ def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
assert
rerank_response
.
status_code
==
400
assert
rerank_response
.
status_code
==
400
# Assert just a small fragments of the response
# Assert just a small fragments of the response
assert
"Please reduce the length of the input."
in
\
assert
"Please reduce the length of the input."
in
\
rerank_response
.
text
rerank_response
.
text
\ No newline at end of file
tests/entrypoints/openai/test_serving_models.py
View file @
ec5e299c
...
@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
...
@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
2-7b
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
3.2-1B-Instruct
"
)
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
LORA_LOADING_SUCCESS_MESSAGE
=
(
LORA_LOADING_SUCCESS_MESSAGE
=
(
"Success: LoRA adapter '{lora_name}' added successfully."
)
"Success: LoRA adapter '{lora_name}' added successfully."
)
...
...
tests/entrypoints/openai/test_shutdown.py
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
openai
import
openai
import
pytest
import
pytest
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B
-Instruct
"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_sleep.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
requests
from
...utils
import
RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B"


def test_sleep_mode():
    """POST to ``/sleep`` and then ``/wake_up`` and expect HTTP 200 from both.

    The server is started with ``--enable-sleep-mode`` and with
    ``VLLM_SERVER_DEV_MODE=1`` in its environment.
    """
    server_env = {"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"}
    # dtype, max-len etc set so that this can run in CI
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enable-sleep-mode",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args,
                            env_dict=server_env) as remote_server:
        sleep_url = remote_server.url_for("/sleep")
        response = requests.post(sleep_url, data={"level": "1"})
        assert response.status_code == 200

        wake_url = remote_server.url_for("/wake_up")
        response = requests.post(wake_url)
        assert response.status_code == 200
tests/entrypoints/openai/test_transcription_validation.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
# imports for transcription validation tests
import
io
import
json
import
librosa
import
numpy
as
np
import
openai
import
pytest
import
soundfile
as
sf
from
vllm.assets.audio
import
AudioAsset
from
...utils
import
RemoteOpenAIServer
@pytest.fixture
def mary_had_lamb():
    """Yield the 'mary_had_lamb' audio asset as an open binary file handle.

    The file is closed when the consuming test finishes.
    """
    audio_path = str(AudioAsset('mary_had_lamb').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file
@pytest.fixture
def winning_call():
    """Yield the 'winning_call' audio asset as an open binary file handle.

    The file is closed when the consuming test finishes.
    """
    audio_path = str(AudioAsset('winning_call').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file
@pytest.mark.asyncio
async def test_basic_audio(mary_had_lamb):
    """Transcribe the clip with and without a prompt and check the text.

    The un-prompted transcript must contain "Mary had a little lamb,";
    the prompted transcript must not contain the prompt text itself.
    """
    model_name = "openai/whisper-large-v3-turbo"
    server_args = ["--enforce-eager"]
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    prompt = "THE FIRST WORDS I SPOKE"

    # Keyword arguments common to both transcription requests.
    common_kwargs = dict(
        model=model_name,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )

    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()

        transcription = await client.audio.transcriptions.create(
            **common_kwargs)
        out = json.loads(transcription)['text']
        assert "Mary had a little lamb," in out

        # This should "force" whisper to continue prompt in all caps
        transcription_wprompt = await client.audio.transcriptions.create(
            prompt=prompt, **common_kwargs)
        out_capital = json.loads(transcription_wprompt)['text']
        assert prompt not in out_capital
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb):
    """Check that invalid transcription requests raise BadRequestError.

    Two cases are exercised: an unknown language code, and an audio
    payload built by repeating the clip ten times.
    """
    model_name = "openai/whisper-small"
    server_args = ["--enforce-eager"]

    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        create = client.audio.transcriptions.create

        # invalid language
        with pytest.raises(openai.BadRequestError):
            await create(model=model_name,
                         file=mary_had_lamb,
                         language="hh",
                         temperature=0.0)

        # Expect audio too long: repeat the timeseries
        mary_had_lamb.seek(0)
        samples, sample_rate = librosa.load(mary_had_lamb)
        long_samples = np.tile(samples, 10)

        # Repeated audio to buffer
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, long_samples, sample_rate, format='WAV')
        wav_buffer.seek(0)

        with pytest.raises(openai.BadRequestError):
            await create(model=model_name,
                         file=wav_buffer,
                         language="en",
                         temperature=0.0)
@pytest.mark.asyncio
async def test_non_asr_model(winning_call):
    """Request a transcription from a text-only model and expect a 400.

    The response must carry code 400, no text, and the "does not support
    Transcriptions API" message.
    """
    # text to text model
    model_name = "JackFram/llama-68m"
    server_args = ["--enforce-eager"]

    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        request = dict(model=model_name,
                       file=winning_call,
                       language="en",
                       temperature=0.0)
        res = await client.audio.transcriptions.create(**request)

        assert res.code == 400 and not res.text
        expected_message = "The model does not support Transcriptions API"
        assert res.message == expected_message
@pytest.mark.asyncio
async def test_completion_endpoints():
    """Call the chat and completion endpoints on a Whisper server.

    Both requests are expected to return code 400 with the matching
    "does not support ... API" message.
    """
    # speech-to-text model, queried through the text-generation endpoints
    model_name = "openai/whisper-small"
    server_args = ["--enforce-eager"]

    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()

        chat_messages = [{
            "role": "system",
            "content": "You are a helpful assistant."
        }]
        res = await client.chat.completions.create(model=model_name,
                                                   messages=chat_messages)
        assert res.code == 400
        assert res.message == "The model does not support Chat Completions API"

        res = await client.completions.create(model=model_name,
                                              prompt="Hello")
        assert res.code == 400
        assert res.message == "The model does not support Completions API"
tests/entrypoints/openai/test_vision.py
View file @
ec5e299c
...
@@ -101,7 +101,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
...
@@ -101,7 +101,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
77
5
,
total_tokens
=
78
5
)
completion_tokens
=
10
,
prompt_tokens
=
77
4
,
total_tokens
=
78
4
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -194,7 +194,7 @@ async def test_single_chat_session_image_base64encoded(
...
@@ -194,7 +194,7 @@ async def test_single_chat_session_image_base64encoded(
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
77
5
,
total_tokens
=
78
5
)
completion_tokens
=
10
,
prompt_tokens
=
77
4
,
total_tokens
=
78
4
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
...
tests/entrypoints/openai/test_vision_embedding.py
View file @
ec5e299c
...
@@ -101,5 +101,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
...
@@ -101,5 +101,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
3072
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
3072
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
76
4
assert
embeddings
.
usage
.
prompt_tokens
==
76
3
assert
embeddings
.
usage
.
total_tokens
==
76
4
assert
embeddings
.
usage
.
total_tokens
==
76
3
tests/entrypoints/test_chat_utils.py
View file @
ec5e299c
...
@@ -23,7 +23,7 @@ from ..utils import VLLM_PATH
...
@@ -23,7 +23,7 @@ from ..utils import VLLM_PATH
EXAMPLES_DIR
=
VLLM_PATH
/
"examples"
EXAMPLES_DIR
=
VLLM_PATH
/
"examples"
PHI3V_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
PHI3V_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
ULTRAVOX_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
3
"
)
ULTRAVOX_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
)
QWEN2VL_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
QWEN2VL_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
MLLAMA_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
)
MLLAMA_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
)
LLAMA_GUARD_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-Guard-3-1B"
)
LLAMA_GUARD_MODEL_ID
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-Guard-3-1B"
)
...
...
tests/kernels/test_cutlass_2of4_sparse.py
View file @
ec5e299c
...
@@ -7,7 +7,6 @@ from typing import Tuple, Type
...
@@ -7,7 +7,6 @@ from typing import Tuple, Type
import
pytest
import
pytest
import
torch
import
torch
import
torch.nn.functional
as
F
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
...
@@ -55,11 +54,39 @@ def prune_to_2_4(tensor):
...
@@ -55,11 +54,39 @@ def prune_to_2_4(tensor):
return
pruned
.
reshape
(
original_shape
)
return
pruned
.
reshape
(
original_shape
)
# Sanity check for the 2:4 compression: multiplying an identity matrix by the
# compressed weights must give back the original (uncompressed) weights.
def check_compress_decompress_invariance(dtype: torch.dtype, b: torch.Tensor,
                                         b_compressed: torch.Tensor,
                                         b_metadata: torch.Tensor):
    """Assert that `b_compressed`/`b_metadata` decompress back to `b`.

    Args:
        dtype: dtype of the identity operand fed to the sparse matmul.
        b: the original, uncompressed matrix.
        b_compressed: compressed form of `b`.
        b_metadata: sparsity metadata accompanying `b_compressed`.

    Raises:
        AssertionError: if the recovered matrix is not close to `b`.
    """
    # For float16 and bfloat16, cutlass_scaled_sparse_mm's output must be the
    # same dtype as its inputs. This branch addresses that constraint while
    # arbitrarily using bfloat16 for the int8/fp8 cases.
    if dtype is torch.float16:
        out_dtype = torch.float16
    else:
        out_dtype = torch.bfloat16

    identity = torch.eye(b.shape[0], device='cuda', dtype=dtype)
    unit_scale = torch.ones(1, device='cuda', dtype=torch.float32)

    recovered = ops.cutlass_scaled_sparse_mm(identity,
                                             b_compressed,
                                             b_metadata,
                                             unit_scale,
                                             unit_scale,
                                             out_dtype=out_dtype)

    expected = b.to(dtype=out_dtype)
    torch.testing.assert_close(expected, recovered)
def
make_rand_sparse_tensors
(
def
make_rand_sparse_tensors
(
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]:
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
*
5
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
*
5
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
if
dtype
==
torch
.
int8
:
# ensure A and B aren't all zeros after rounding
a
=
a
*
5.0
b
=
b
*
5.0
b
=
prune_to_2_4
(
b
.
t
()).
t
()
b
=
prune_to_2_4
(
b
.
t
()).
t
()
...
@@ -75,6 +102,7 @@ def make_rand_sparse_tensors(
...
@@ -75,6 +102,7 @@ def make_rand_sparse_tensors(
raise
ValueError
(
"unsupported dtype"
)
raise
ValueError
(
"unsupported dtype"
)
b_compressed
,
e
=
ops
.
cutlass_sparse_compress
(
b
.
t
())
b_compressed
,
e
=
ops
.
cutlass_sparse_compress
(
b
.
t
())
check_compress_decompress_invariance
(
dtype
,
b
,
b_compressed
,
e
)
# Compressed B, Metadata, Original A, B
# Compressed B, Metadata, Original A, B
return
b_compressed
,
e
,
a
,
b
return
b_compressed
,
e
,
a
,
b
...
@@ -134,27 +162,37 @@ MNK_FACTORS = [
...
@@ -134,27 +162,37 @@ MNK_FACTORS = [
# Test working with a subset of A and B for sparse matmul
# Test working with a subset of A and B for sparse matmul
@
pytest
.
mark
.
skip
(
reason
=
"2of4 sparse w16a16 CUTLASS produces bad output."
)
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
reason
=
"Sparse CUTLASS is not supported on this GPU type."
)
reason
=
"Sparse CUTLASS is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"m,
k
,
n
"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"m,
n
,
k
"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
,
torch
.
float16
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
,
torch
.
float16
])
def
test_cutlass_sparse_gemm
(
m
:
int
,
k
:
int
,
n
:
int
,
dtype
:
Type
[
torch
.
dtype
]):
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
def
test_cutlass_sparse_gemm
(
m
:
int
,
k
:
int
,
n
:
int
,
dtype
:
Type
[
torch
.
dtype
],
use_bias
:
bool
):
# Create tensors
# Create tensors
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
dtype
,
m
,
n
,
k
)
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
dtype
,
m
,
n
,
k
)
scale_a
=
torch
.
ones
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
scale_a
=
torch
.
ones
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
scale_b
=
torch
.
ones
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
scale_b
=
torch
.
ones
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
bias
=
torch
.
rand
((
n
,
),
device
=
"cuda"
,
dtype
=
dtype
)
if
use_bias
else
None
out
=
ops
.
cutlass_scaled_sparse_mm
(
a
,
out
=
ops
.
cutlass_scaled_sparse_mm
(
a
,
b_comp
,
b_comp
,
e
,
e
,
scale_a
,
scale_a
,
scale_b
,
scale_b
,
out_dtype
=
dtype
)
out_dtype
=
dtype
,
bas
eline
=
F
.
linear
(
a
,
b
.
T
)
b
i
as
=
bias
)
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e-2
,
atol
=
1e-2
)
baseline
=
baseline_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
=
dtype
,
bias
=
bias
)
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e-2
,
atol
=
3e-1
)
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
...
@@ -162,27 +200,34 @@ def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]):
...
@@ -162,27 +200,34 @@ def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]):
@
pytest
.
mark
.
parametrize
(
"m, k, n"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"m, k, n"
,
MNK_FACTORS
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
89
),
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
89
),
reason
=
"FP8 is not supported on this GPU type."
)
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_sparse_fp8_gemm
(
m
:
int
,
n
:
int
,
k
:
int
):
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
def
test_cutlass_sparse_fp8_gemm
(
m
:
int
,
n
:
int
,
k
:
int
,
use_bias
:
bool
):
# Create tensors
# Create tensors
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
torch
.
float8_e4m3fn
,
m
,
n
,
k
)
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
torch
.
float8_e4m3fn
,
m
,
n
,
k
)
scale_a
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
scale_a
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
scale_b
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
scale_b
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
out_dtype
=
torch
.
bfloat16
bias
=
torch
.
rand
(
(
n
,
),
device
=
"cuda"
,
dtype
=
out_dtype
)
*
10
if
use_bias
else
None
out
=
ops
.
cutlass_scaled_sparse_mm
(
a
,
out
=
ops
.
cutlass_scaled_sparse_mm
(
a
,
b_comp
,
b_comp
,
e
,
e
,
scale_a
,
scale_a
,
scale_b
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
out_dtype
=
out_dtype
,
bias
=
bias
)
baseline
=
baseline_scaled_mm
(
a
,
baseline
=
baseline_scaled_mm
(
a
,
b
,
b
,
scale_a
,
scale_a
,
scale_b
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
out_dtype
=
out_dtype
,
bias
=
bias
)
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e
0
,
atol
=
2e0
)
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e
-2
,
atol
=
3e-1
)
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
...
@@ -198,18 +243,24 @@ def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
...
@@ -198,18 +243,24 @@ def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
torch
.
int8
,
m
,
n
,
k
)
b_comp
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
torch
.
int8
,
m
,
n
,
k
)
scale_a
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
scale_a
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
scale_b
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
scale_b
=
(
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
))
out_dtype
=
torch
.
bfloat16
bias
=
torch
.
rand
(
(
n
,
),
device
=
"cuda"
,
dtype
=
out_dtype
)
*
10
if
use_bias
else
None
out
=
ops
.
cutlass_scaled_sparse_mm
(
a
,
out
=
ops
.
cutlass_scaled_sparse_mm
(
a
,
b_comp
,
b_comp
,
e
,
e
,
scale_a
,
scale_a
,
scale_b
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
out_dtype
=
out_dtype
,
bias
=
bias
)
baseline
=
baseline_scaled_mm
(
a
,
baseline
=
baseline_scaled_mm
(
a
,
b
,
b
,
scale_a
,
scale_a
,
scale_b
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
out_dtype
=
out_dtype
,
bias
=
bias
)
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e0
,
atol
=
2e0
)
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e0
,
atol
=
2e0
)
tests/kernels/test_mamba_mixer2.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
unittest
from
typing
import
Tuple
import
pytest
import
torch
from
tests.utils
import
multi_gpu_test
from
vllm.distributed.parallel_state
import
(
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.model_executor.layers.mamba.mamba_mixer2
import
Mixer2RMSNormGated
from
vllm.platforms
import
current_platform
from
vllm.utils
import
update_environment_variables
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [128])
@pytest.mark.parametrize(
    "hidden_size_n_groups",
    [
        (64, 1),
        (64, 2),
        (64, 4),  # hidden_size be divisible by num_gpus
        (100, 5),  # and n_groups must divide hidden_size
    ])
@pytest.mark.parametrize("dtype", [torch.float16])
def test_mixer2_gated_norm_multi_gpu(
        batch_size: int,
        seq_len: int,
        hidden_size_n_groups: Tuple[int, int],
        dtype: torch.dtype,
        device: str = 'cuda',
):
    """Spawn two worker processes running the tensor-parallel gated-norm check."""
    hidden_size, n_groups = hidden_size_n_groups
    num_processes = 2

    # Arguments handed to every worker after its process index.
    worker_args = (
        num_processes,
        batch_size,
        seq_len,
        hidden_size,
        n_groups,
        dtype,
        device,
    )

    # need to use torch.mp.spawn otherwise will have problems with
    # torch.distributed and cuda
    torch.multiprocessing.spawn(mixer2_gated_norm_tensor_parallel,
                                args=worker_args,
                                nprocs=2)
def mixer2_gated_norm_tensor_parallel(
    local_rank: int,
    world_size: int,
    batch_size: int,
    seq_len: int,
    hidden_size: int,
    n_groups: int,
    dtype: torch.dtype,
    device: str,
):
    """Per-process worker: compare the TP gated norm against a non-TP reference.

    Args:
        local_rank: index of this process; also selects ``cuda:{local_rank}``.
        world_size: tensor-parallel world size.
        batch_size: leading dimension of the random inputs.
        seq_len: second dimension of the random inputs.
        hidden_size: full (unsharded) hidden size.
        n_groups: full (unsharded) number of groups.
        dtype: default dtype used for weights and inputs.
        device: overwritten below with ``cuda:{local_rank}``.

    Raises:
        AssertionError: if this rank's output differs from the matching slice
            of the reference output.
    """
    # `import unittest` alone does not guarantee the `mock` submodule is
    # loaded in a freshly spawned process, so import it explicitly.
    import unittest.mock

    current_platform.seed_everything(0)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    update_environment_variables({
        'RANK': str(local_rank),
        'LOCAL_RANK': str(local_rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': '12345',
    })

    # initialize distributed
    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    # create random weights and inputs
    weight = torch.rand((hidden_size, ), dtype=dtype, device=device)
    hidden_states = torch.randn(batch_size, seq_len, hidden_size)
    gate_states = torch.randn(batch_size, seq_len, hidden_size)

    # create gated-norm with TP
    mixer = Mixer2RMSNormGated(
        full_hidden_size=hidden_size,
        full_n_groups=n_groups,
    )
    mixer.weight.weight_loader(mixer.weight, weight)  # load

    # create gated-norm without TP to compute reference
    # - utilize mock patching to disable TP while constructing it
    with (unittest.mock.patch(
            "vllm.model_executor.layers.mamba.mamba_mixer2."
            "get_tensor_model_parallel_world_size",
            return_value=1),
          unittest.mock.patch(
              "vllm.model_executor.layers.mamba.mamba_mixer2."
              "get_tensor_model_parallel_rank",
              return_value=0)):
        mixer_single_gpu = Mixer2RMSNormGated(
            full_hidden_size=hidden_size,
            full_n_groups=n_groups,
        )
        # assign weight to single-gpu mixer
        mixer_single_gpu.weight.data = weight

    # generate and compare
    N = hidden_size // world_size
    shard = slice(local_rank * N, (local_rank + 1) * N)
    output = mixer(
        hidden_states[..., shard],
        gate_states[..., shard],
    )
    ref_output = mixer_single_gpu(hidden_states, gate_states)

    # The comparison result used to be discarded, so the test could never
    # fail; assert on it.
    # NOTE(review): tolerances are unchanged from the original no-op check
    # and may need tuning now that it is enforced.
    assert torch.allclose(output, ref_output[..., shard], atol=1e-3,
                          rtol=1e-3), (
                              f"rank {local_rank}: tensor-parallel output "
                              "differs from single-GPU reference")
tests/kernels/test_mamba_ssm_ssd.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Dict
,
Tuple
import
pytest
import
torch
import
torch.nn.functional
as
F
from
einops
import
rearrange
,
repeat
from
vllm.model_executor.layers.mamba.ops.ssd_combined
import
(
mamba_chunk_scan_combined
)
from
vllm.platforms
import
current_platform
# Added by the IBM Team, 2024
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/modules/ssd_minimal.py
# this is the segsum implementation taken from above
def segsum(x):
    """Calculates segment sum.

    With ``T = x.size(-1)``, the result gains a trailing axis of length T.
    Entry ``[..., i, j]`` is the sum of ``x[..., j+1 : i+1]`` when ``j <= i``
    and ``-inf`` otherwise.
    """
    T = x.size(-1)
    tiled = repeat(x, "... d -> ... d e", e=T)

    all_true = torch.ones(T, T, device=tiled.device, dtype=bool)

    # Keep only strictly-lower-triangular terms before accumulating.
    strictly_lower = torch.tril(all_true, diagonal=-1)
    tiled = tiled.masked_fill(~strictly_lower, 0)
    running = torch.cumsum(tiled, dim=-2)

    # Positions above the diagonal become -inf.
    lower_incl_diag = torch.tril(all_true, diagonal=0)
    return running.masked_fill(~lower_incl_diag, -torch.inf)
def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
    """Reference chunked SSD computation used to validate the kernels.

    Arguments:
        X: (batch, length, n_heads, d_head)
        A: (batch, length, n_heads)
        B: (batch, length, n_heads, d_state)
        C: (batch, length, n_heads, d_state)
        block_len: chunk length; must divide `length`.
        initial_states: optional states prepended before the first chunk;
            zeros are used when omitted.
    Return:
        Y: (batch, length, n_heads, d_head)
        final_state: the state after the last chunk.
    """
    assert X.dtype == A.dtype == B.dtype == C.dtype
    assert X.shape[1] % block_len == 0

    # Rearrange into blocks/chunks
    X, A, B, C = [
        rearrange(t, "b (c l) ... -> b c l ...", l=block_len)
        for t in (X, A, B, C)
    ]

    A = rearrange(A, "b c l h -> b h c l")
    A_cumsum = torch.cumsum(A, dim=-1)

    # 1. Compute the output for each intra-chunk (diagonal blocks)
    L = torch.exp(segsum(A))
    Y_diag = torch.einsum("bclhn,bcshn,bhcls,bcshp->bclhp", C, B, L, X)

    # 2. Compute the state for each intra-chunk
    # (right term of low-rank factorization of off-diagonal blocks; B terms)
    chunk_totals = A_cumsum[:, :, :, -1:]
    decay_states = torch.exp(chunk_totals - A_cumsum)
    states = torch.einsum("bclhn,bhcl,bclhp->bchpn", B, decay_states, X)

    # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at
    #    chunk boundaries
    # (middle term of factorization of off-diag blocks; A terms)
    if initial_states is None:
        initial_states = torch.zeros_like(states[:, :1])
    states = torch.cat([initial_states, states], dim=1)
    padded_totals = F.pad(A_cumsum[:, :, :, -1], (1, 0))
    decay_chunk = torch.exp(segsum(padded_totals))
    new_states = torch.einsum("bhzc,bchpn->bzhpn", decay_chunk, states)
    states = new_states[:, :-1]
    final_state = new_states[:, -1]

    # 4. Compute state -> output conversion per chunk
    # (left term of low-rank factorization of off-diagonal blocks; C terms)
    state_decay_out = torch.exp(A_cumsum)
    Y_off = torch.einsum('bclhn,bchpn,bhcl->bclhp', C, states,
                         state_decay_out)

    # Add output of intra-chunk and inter-chunk terms
    # (diagonal and off-diagonal blocks)
    Y = rearrange(Y_diag + Y_off, "b c l h p -> b (c l) h p")
    return Y, final_state
def generate_random_inputs(batch_size,
                           seqlen,
                           n_heads,
                           d_head,
                           itype,
                           device='cuda'):
    """Create seeded random (A, dt, X, B, C) tensors for the SSD tests.

    The seed is reset to 0 on every call. A has shape (n_heads,), dt has
    shape (batch_size, seqlen, n_heads), and X, B, C all have shape
    (batch_size, seqlen, n_heads, d_head).
    """
    current_platform.seed_everything(0)

    tensor_opts = dict(dtype=itype, device=device)
    per_head_shape = (batch_size, seqlen, n_heads)
    full_shape = (batch_size, seqlen, n_heads, d_head)

    # The draw order (A, dt, X, B, C) is kept so the random stream matches.
    A = -torch.exp(torch.rand(n_heads, **tensor_opts))
    dt = F.softplus(torch.randn(*per_head_shape, **tensor_opts) - 4)
    X = torch.randn(full_shape, **tensor_opts)
    B = torch.randn(full_shape, **tensor_opts)
    C = torch.randn(full_shape, **tensor_opts)

    return A, dt, X, B, C
def generate_continous_batched_examples(example_lens_by_batch,
                                        num_examples,
                                        full_length,
                                        last_taken,
                                        exhausted,
                                        n_heads,
                                        d_head,
                                        itype,
                                        device='cuda'):
    """Yield continuous batches cut from full-length random examples.

    Generates `num_examples` random examples of length `full_length`,
    computes their reference outputs once, then for every entry of
    `example_lens_by_batch` cuts the next slice of each example and
    concatenates the slices into one continuous batch.

    Args:
        example_lens_by_batch: sequence of tuples; each tuple gives, per
            example, how many samples to take for that batch.
        num_examples: number of examples to generate.
        full_length: length of each generated example.
        last_taken: dict updated in place, example index -> offset of the
            next sample to take.
        exhausted: dict updated in place, example index -> True when that
            example wrapped around to offset 0.
        n_heads, d_head, itype: forwarded to `generate_random_inputs`.
        device: device for the generated inputs and the batch metadata.

    Yields:
        (list of reference output slices per example, cu_seqlens,
         seq index tensor with a leading batch axis, (A, dt, X, B, C))
    """

    # this function generates a random examples of certain length
    # and then cut according to "example_lens_by_batch" and feed
    # them in continuous batches to the kernels

    # generate the full-length example
    # (`device` is forwarded so inputs and metadata live on the same device;
    # it was previously ignored here)
    A, dt, X, B, C = generate_random_inputs(num_examples,
                                            full_length,
                                            n_heads,
                                            d_head,
                                            itype,
                                            device=device)

    Y_min, _ = ssd_minimal_discrete(X * dt.unsqueeze(-1),
                                    A * dt,
                                    B,
                                    C,
                                    block_len=full_length // 4)

    # internal function that outputs a cont batch of examples
    # given a tuple of lengths for each example in the batch
    # e.g., example_lens=(8, 4) means take 8 samples from first eg,
    #       4 examples from second eg, etc
    def get_continuous_batch(example_lens: Tuple[int, ...]):

        indices = []
        for i, x in enumerate(example_lens):
            c = last_taken.get(i, 0)
            indices.append((c, c + x))
            last_taken[i] = (c + x) % full_length
            exhausted[i] = last_taken[i] == 0

        return (torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices)
                              ]).unsqueeze(0) for x in (dt, X, B, C))

    # internal function that maps "n" to the appropriate right boundary
    # value when forming continuous batches from examples of length given
    # by "full_length".
    # - e.g., when n > full_length, returns n % full_length
    #         when n == full_length, returns full_length
    def end_boundary(n: int):
        return n - ((n - 1) // full_length) * full_length

    IND_E = None
    for spec in example_lens_by_batch:

        # get the (maybe partial) example seen in this cont batch
        dt2, X2, B2, C2 = get_continuous_batch(spec)

        # get the metadata
        cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0)
        sed_idx = torch.zeros(cu_seqlens[-1],
                              dtype=torch.int32,
                              device=cu_seqlens.device)
        for i, (srt, end) in enumerate(zip(
                cu_seqlens,
                cu_seqlens[1:],
        )):
            sed_idx[srt:end] = i

        # for cont batch
        if IND_E is None:
            IND_S = [0 for _ in range(len(spec))]
        else:
            IND_S = [x % full_length for x in IND_E]
        IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)]

        yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)],
               cu_seqlens, sed_idx.unsqueeze(0), (A, dt2, X2, B2, C2))
@pytest.mark.parametrize("itype",
                         [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)])
def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
                                         itype):
    """Compare the chunk-scan kernel against the reference on one example.

    Only the last sequence position of the output and the last head of the
    final state are compared.
    """

    # this tests the kernels on a single example (no batching)

    batch_size = 1  # batch_size
    # ssd_minimal_discrete requires chunk_size divide seqlen
    # - this is only required for generating the reference seqs,
    #   it is not an operational limitation.
    seqlen, chunk_size = seq_len_chunk_size

    # inputs are seeded inside generate_random_inputs
    A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads,
                                            d_head, itype)

    Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt,
                                                  B, C, chunk_size)

    Y, final_state = mamba_chunk_scan_combined(X,
                                               dt,
                                               A,
                                               B,
                                               C,
                                               chunk_size,
                                               D=None,
                                               return_final_states=True)

    # The comparison results used to be discarded, so the test could never
    # fail; assert on them.
    # NOTE(review): tolerances are unchanged from the original no-op checks
    # and may need per-dtype tuning now that they are enforced.

    # just test the last in sequence
    assert torch.allclose(Y[:, -1], Y_min[:, -1], atol=1e-3, rtol=1e-3), (
        "kernel output differs from reference at the last position")

    # just test the last head
    # NOTE, in the kernel we always cast states to fp32
    assert torch.allclose(final_state[:, -1],
                          final_state_min[:, -1].to(torch.float32),
                          atol=1e-3,
                          rtol=1e-3), (
                              "kernel final state differs from reference")
@pytest.mark.parametrize("itype", [torch.float32, torch.float16])
@pytest.mark.parametrize("n_heads", [4, 8, 13])
@pytest.mark.parametrize("d_head", [5, 16, 21, 32])
@pytest.mark.parametrize(
    "seq_len_chunk_size_cases",
    [

        # small-ish chunk_size (8)
        (64, 8, 2, [(64, 32), (64, 32)]),
        (64, 8, 2, [(32, 32), (32, 32), (32, 32)]),
        (64, 8, 2, [(8, 8), (8, 8), (8, 8)]),  # chunk size boundary
        (64, 8, 2, [(4, 4), (4, 4), (4, 4),
                    (4, 4)]),  # chunk_size larger than cont batches
        (64, 8, 5, [
            (64, 32, 16, 8, 8),
            (8, 16, 32, 16, 8),
            (8, 8, 16, 32, 16),
        ]),  # more examples with varied lengths

        # odd chunk_size
        (64, 29, 2, [(11, 4), (13, 23), (19, 22),
                     (21, 15)]),  # irregular sizes

        # large-ish chunk_size (256)
        (64, 256, 1, [(5, ), (1, ), (1, ),
                      (1, )]),  # irregular sizes with small sequences
        (64, 256, 2, [(5, 30), (1, 2), (1, 2),
                      (1, 2)]),  # irregular sizes with small sequences
    ])
def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
                                     itype):
    """Compare the chunk-scan kernel against the reference on continuous batches.

    Each case is (seqlen, chunk_size, num_examples, per-batch length
    tuples). States returned by one batch are fed to the next as initial
    states, and zeroed for any example that has been exhausted.
    """

    # this test with multiple examples in a continuous batch
    # (i.e. chunked prefill)

    seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases

    # hold state during the cutting process so we know if an
    # example has been exhausted and needs to cycle
    last_taken: Dict = {}  # map: eg -> pointer to last taken sample
    exhausted: Dict = {}  # map: eg -> boolean indicating example is exhausted

    states = None
    for Y_min, cu_seqlens, sed_idx, (A, dt, X, B,
                                     C) in generate_continous_batched_examples(
                                         cases, num_examples, seqlen,
                                         last_taken, exhausted, n_heads,
                                         d_head, itype):

        Y, new_states = mamba_chunk_scan_combined(
            X,
            dt,
            A,
            B,
            C,
            chunk_size,
            D=None,
            cu_seqlens=cu_seqlens,
            seq_idx=sed_idx,
            return_varlen_states=True,
            initial_states=states,
        )

        # just test the last in sequence
        for i in range(num_examples):

            # just test one dim and dstate
            Y_eg = Y[0, cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
            Y_min_eg = Y_min[i][:, 0, 0]
            # The comparison result used to be discarded, so the test could
            # never fail; assert on it.
            # NOTE(review): tolerances are unchanged from the original no-op
            # check and may need tuning now that it is enforced.
            assert torch.allclose(Y_eg, Y_min_eg, atol=1e-3, rtol=1e-3), (
                f"example {i}: kernel output differs from reference")

        # update states
        states = new_states
        for i, clear in exhausted.items():
            if clear:
                states[i].fill_(0.)
                exhausted[i] = False
tests/kernels/test_nvfp4_quant.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
# Skip every test in this module when the device capability check for 100
# fails.
if not current_platform.has_device_capability(100):
    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
                allow_module_level=True)

# Parametrization values for the tests below.
DTYPES = [torch.float16, torch.bfloat16]
SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
PAD_SHAPES = [(90, 64), (150, 64), (128, 48), (128, 80), (150, 80), (90, 48),
              (90, 128), (150, 128), (150, 48), (90, 80)]
SEEDS = [42]
CUDA_DEVICES = ['cuda:0']

# Largest values of the two formats; used to derive the global scale.
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1fn.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max

# E2M1 to float
# 0111 -> 6
# 0110 -> 4
# 0101 -> 3
# 0100 -> 2
# 0011 -> 1.5
# 0010 -> 1
# 0001 -> 0.5
# 0000 -> 0
# Indexed by the 4-bit code; entries 8-15 are the negated values of 0-7.
E2M1_TO_FLOAT32 = [
    0., 0.5, 1., 1.5, 2., 3., 4., 6., 0., -0.5, -1., -1.5, -2., -3., -4., -6.
]
# Number of consecutive elements that share one scale in ref_nvfp4_quant.
BLOCK_SIZE = 16
def cast_from_fp4(x, m, n):
    """Unpack a tensor of packed 4-bit codes into an (m, n) float32 tensor.

    Each element of `x` holds two 4-bit codes. The low nibble is emitted
    first, then the high nibble, and each code is mapped to its value through
    `E2M1_TO_FLOAT32`.

    Args:
        x: integer tensor of packed codes holding m * n codes in total.
        m: number of rows of the result.
        n: number of columns of the result.

    Returns:
        float32 tensor of shape (m, n) on the same device as `x`.
    """
    # The fp4 values are packed in uint8 as [v_1st | v_2nd]
    v_2nd = x & 0xF
    v_1st = (x >> 4) & 0xF
    c = torch.stack((v_2nd, v_1st), dim=-1)

    # One gather through a lookup table replaces a Python-level loop over
    # every element. Codes are cast to int64 because uint8 index tensors are
    # interpreted as masks.
    lut = torch.tensor(E2M1_TO_FLOAT32, dtype=torch.float32, device=c.device)
    out = lut[c.flatten().long()]
    return out.reshape(m, n)
def cast_to_fp4(x):
    """Round every element of `x` to a value in {0, 0.5, 1, 1.5, 2, 3, 4, 6}.

    Rounding is applied to the magnitude and the sign is restored afterwards.
    The input tensor is not modified.
    """
    signs = torch.sign(x)
    mag = torch.abs(x)

    # (selector, level) pairs, applied in order. Each level lies inside its
    # own bucket, so earlier writes are never picked up by later selectors.
    buckets = (
        (lambda v: (v >= 0.0) & (v <= 0.25), 0.0),
        (lambda v: (v > 0.25) & (v < 0.75), 0.5),
        (lambda v: (v >= 0.75) & (v <= 1.25), 1.0),
        (lambda v: (v > 1.25) & (v < 1.75), 1.5),
        (lambda v: (v >= 1.75) & (v <= 2.5), 2.0),
        (lambda v: (v > 2.5) & (v < 3.5), 3.0),
        (lambda v: (v >= 3.5) & (v <= 5.0), 4.0),
        (lambda v: v > 5.0, 6.0),
    )
    for select, level in buckets:
        mag[select(mag)] = level

    return mag * signs
def get_reciprocal(x):
    """Return 1/x, mapping zero to zero instead of dividing by it.

    Accepts a torch.Tensor (handled element-wise), a float or an int.

    Raises:
        TypeError: for any other input type.
    """
    if isinstance(x, torch.Tensor):
        zero = torch.tensor(0.0, dtype=x.dtype)
        inverse = 1.0 / x
        return torch.where(x == 0, zero, inverse)

    if isinstance(x, (float, int)):
        if x == 0:
            return 0.0
        return 1.0 / x

    raise TypeError("Input must be a float, int, or a torch.Tensor.")
def ref_nvfp4_quant(x, global_scale):
    """Reference block-wise quantization of a 2-D tensor.

    `x` is split along its last axis into blocks of BLOCK_SIZE elements. Each
    block gets a scale derived from its absolute maximum and `global_scale`,
    round-tripped through float8_e4m3fn.

    Args:
        x: 2-D tensor whose second dimension is a multiple of BLOCK_SIZE.
        global_scale: float32 tensor holding the global scale.

    Returns:
        (quantized values of shape (m, n), block scales of shape
         (m, n // BLOCK_SIZE)), both float32.
    """
    assert global_scale.dtype == torch.float32
    assert x.ndim == 2
    m, n = x.shape

    blocks = torch.reshape(x, (m, n // BLOCK_SIZE, BLOCK_SIZE))
    block_amax = torch.max(torch.abs(blocks), dim=-1, keepdim=True)[0]
    vec_max = block_amax.to(torch.float32)

    scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX))
    # Round-trip through fp8 so the scale matches what can be stored.
    scale = scale.to(torch.float8_e4m3fn).to(torch.float32)

    output_scale = get_reciprocal(scale * get_reciprocal(global_scale))
    scaled = blocks.to(torch.float32) * output_scale
    clipped = torch.clamp(scaled, -6.0, 6.0).reshape(m, n)

    return cast_to_fp4(clipped), scale.squeeze(-1)
def recover_swizzled_scales(scale, m, n):
    """Undo the swizzled scale layout and return the linear (m, n/BLOCK_SIZE) view.

    Rows are padded to a multiple of 128 and scale columns to a multiple
    of 4 before the layout is reversed; the padding is cropped from the
    float32 result.
    """

    def _round_up(value, multiple):
        return (value + multiple - 1) // multiple * multiple

    scale_n = n // BLOCK_SIZE
    rounded_m = _round_up(m, 128)
    rounded_n = _round_up(scale_n, 4)

    # Recover the swizzled scaling factor to linear layout
    tiled_shape = (1, rounded_m // 128, rounded_n // 4, 32, 4, 4)
    tiles = torch.reshape(scale, tiled_shape)
    tiles = torch.permute(tiles, (0, 1, 4, 3, 2, 5))
    linear = torch.reshape(tiles, (rounded_m, rounded_n))
    linear = linear.to(torch.float32)

    return linear[:m, :scale_n]
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_quantize_to_fp4(
    dtype: torch.dtype,
    shape: tuple[int, int],
    seed: int,
    device: str,
) -> None:
    """Compare ``ops.scaled_fp4_quant`` with the Python reference quantizer.

    Random inputs of the given shape and dtype are quantized by both
    implementations. The kernel output is unpacked and its scales are
    converted to a linear layout before values and scales are compared.
    """
    current_platform.seed_everything(seed)
    torch.set_default_device(device)

    rows, cols = shape
    inputs = torch.randn((rows, cols), dtype=dtype)

    # Tensor-wide scale computed from the largest input magnitude.
    input_amax = torch.abs(inputs).max().to(torch.float32)
    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / input_amax

    expected_values, expected_scales = ref_nvfp4_quant(inputs, global_scale)

    packed_values, swizzled_scales = ops.scaled_fp4_quant(
        inputs, global_scale)
    actual_scales = recover_swizzled_scales(swizzled_scales, rows, cols)
    actual_values = cast_from_fp4(packed_values, rows, cols)

    torch.testing.assert_close(actual_values, expected_values)
    torch.testing.assert_close(actual_scales, expected_scales)
@pytest.mark.parametrize("pad_shape", PAD_SHAPES)
@torch.inference_mode()
def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
    """Compare kernel and reference quantization for the ``PAD_SHAPES`` cases.

    Uses float16 inputs, a fixed seed of 42 and the ``cuda:0`` device. The
    kernel output is unpacked and its scales are converted to a linear
    layout before values and scales are compared with the reference.
    """
    dtype = torch.float16
    current_platform.seed_everything(42)
    torch.set_default_device('cuda:0')

    rows, cols = pad_shape
    inputs = torch.randn((rows, cols), dtype=dtype)

    # Tensor-wide scale computed from the largest input magnitude.
    input_amax = torch.abs(inputs).max().to(torch.float32)
    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / input_amax

    expected_values, expected_scales = ref_nvfp4_quant(inputs, global_scale)

    packed_values, swizzled_scales = ops.scaled_fp4_quant(
        inputs, global_scale)
    actual_scales = recover_swizzled_scales(swizzled_scales, rows, cols)
    actual_values = cast_from_fp4(packed_values, rows, cols)

    torch.testing.assert_close(actual_values, expected_values)
    torch.testing.assert_close(actual_scales, expected_scales)
Prev
1
…
5
6
7
8
9
10
11
12
13
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment