Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e661d594
Commit
e661d594
authored
Aug 12, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.4' into v0.5.4-dtk24.04.1
parents
6b16ea2e
4db5176d
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2015 additions
and
644 deletions
+2015
-644
tests/entrypoints/openai/test_disable_mp.py
tests/entrypoints/openai/test_disable_mp.py
+715
-0
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+0
-1
tests/entrypoints/openai/test_oot_registration.py
tests/entrypoints/openai/test_oot_registration.py
+4
-0
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
+83
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+39
-0
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+3
-1
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+41
-9
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+4
-4
tests/kernels/test_flash_attn.py
tests/kernels/test_flash_attn.py
+13
-6
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+0
-2
tests/kernels/test_marlin_gemm.py
tests/kernels/test_marlin_gemm.py
+117
-37
tests/kernels/test_pos_encoding.py
tests/kernels/test_pos_encoding.py
+1
-1
tests/kernels/test_sampler.py
tests/kernels/test_sampler.py
+33
-20
tests/lora/test_gemma.py
tests/lora/test_gemma.py
+1
-1
tests/lora/test_layers.py
tests/lora/test_layers.py
+184
-59
tests/lora/test_lora.py
tests/lora/test_lora.py
+0
-224
tests/lora/test_punica.py
tests/lora/test_punica.py
+0
-258
tests/lora/test_punica_sizes.py
tests/lora/test_punica_sizes.py
+408
-0
tests/lora/test_punica_variation.py
tests/lora/test_punica_variation.py
+342
-0
tests/lora/test_quant_model.py
tests/lora/test_quant_model.py
+27
-21
No files found.
tests/entrypoints/openai/test_disable_mp.py
0 → 100644
View file @
e661d594
"""
Repeat of tests in test_completion.py with the non-mp backend.
"""
# imports for guided decoding tests
import
json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
# Base model served in these tests; any model with a chat template works.
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# The adapters below technically target a different base model, which is
# acceptable because generation quality is not what is being tested.
LORA_NAME = "typeof/zephyr-7b-beta-lora"
PA_NAME = "swapnilbp/llama_tweet_ptune"
# Keep in sync with the prompt adapter: if PA_NAME changes, this value
# may need to change as well.
PA_NUM_VIRTUAL_TOKENS = 8
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the LoRA adapter snapshot (LORA_NAME) once per module.

    Returns whatever ``snapshot_download`` returns for that repo id.
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    """Yield a copy of the LoRA adapter whose tokenizer has three extra tokens.

    The adapter files are copied into ``<tmp>/zephyr`` and the base model
    tokenizer, extended with "vllm1", "vllm2" and "vllm3", is saved next to
    them.

    The temporary directory is managed by a ``with`` block so that it is
    removed even when setup fails (for example when the ``added == 3``
    check does not hold) or when teardown is triggered by an error.
    """
    with TemporaryDirectory() as tmp_root:
        tmp_model_dir = f"{tmp_root}/zephyr"
        shutil.copytree(zephyr_lora_files, tmp_model_dir)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Copy tokenizer to adapter and add some unique tokens
        # 32000, 32001, 32002
        added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                     special_tokens=True)
        assert added == 3
        tokenizer.save_pretrained(tmp_model_dir)
        yield tmp_model_dir
@pytest.fixture(scope="module")
def zephyr_pa_files():
    """Fetch the prompt-adapter snapshot (PA_NAME) once per module.

    Returns whatever ``snapshot_download`` returns for that repo id.
    """
    pa_snapshot = snapshot_download(repo_id=PA_NAME)
    return pa_snapshot
@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
                        zephyr_pa_files):
    """Build the command-line argument list used to launch the test server.

    The list is assembled from three groups (engine, LoRA, prompt adapter)
    followed by the flag that disables frontend multiprocessing.
    """
    engine_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
    ]
    prompt_adapter_args = [
        "--enable-prompt-adapter",
        "--prompt-adapters",
        # the same adapter files are registered under two names
        f"zephyr-pa={zephyr_pa_files}",
        f"zephyr-pa2={zephyr_pa_files}",
        "--max-prompt-adapters",
        "2",
        "--max-prompt-adapter-token",
        "128",
    ]
    return (engine_args + lora_args + prompt_adapter_args +
            ["--disable-frontend-multiprocessing"])
@pytest.fixture(scope="module")
def server(default_server_args):
    """Run a RemoteOpenAIServer for MODEL_NAME for the duration of the module.

    The server object is used as a context manager, so it is exited when
    the fixture is torn down.
    """
    launcher = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with launcher as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the module-scoped server."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name,num_virtual_tokens",
    [
        (MODEL_NAME, 0),
        ("zephyr-lora", 0),
        ("zephyr-lora2", 0),
        ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
        ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS),
    ],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
                                 num_virtual_tokens: int):
    """Check a single greedy completion for a text prompt and a token-ID prompt.

    For the text prompt the reported usage must equal 5 completion tokens
    and 6 prompt tokens plus ``num_virtual_tokens``.
    """
    text_result = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )
    assert text_result.id is not None
    assert text_result.choices is not None and len(text_result.choices) == 1

    only_choice = text_result.choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"

    expected_usage = openai.types.CompletionUsage(
        completion_tokens=5,
        prompt_tokens=6 + num_virtual_tokens,
        total_tokens=11 + num_virtual_tokens,
    )
    assert text_result.usage == expected_usage

    # same request shape, but the prompt is given as token IDs
    ids_result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(ids_result.choices[0].text) >= 1
@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
    """Echo a token-ID prompt through "zephyr-lora2" and check the added tokens.

    IDs 32000-32002 are expected to be echoed as "vllm1vllm2vllm3".
    """
    prompt_ids = [0, 0, 32000, 32001, 32002]
    echoed = await client.completions.create(
        model="zephyr-lora2",
        prompt=prompt_ids,
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = echoed.choices[0].text
    # Added tokens should appear in tokenized prompt
    assert echoed_text.startswith("<unk><unk>vllm1vllm2vllm3")
@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
    """Echo the same token-ID prompt through the base model.

    The echoed text must not contain "vllm", i.e. the adapter-only tokens
    are not rendered by the base model.
    """
    prompt_ids = [0, 0, 32000, 32001, 32002]
    echoed = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt_ids,
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = echoed.choices[0].text
    # Added tokens should not appear in tokenized prompt
    assert "vllm" not in echoed_text
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=None`` the returned choice must carry no logprobs."""
    # prompt is given as token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    first_choice = result.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora and 1 pa hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=0`` logprobs are present and hold one entry per step."""
    # prompt is given as token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    logprob_info = result.choices[0].logprobs
    assert logprob_info is not None
    assert logprob_info.token_logprobs is not None
    assert logprob_info.top_logprobs is not None
    assert len(logprob_info.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=5`` the first step must report 5 or 6 top logprobs."""
    # prompt is given as token IDs
    result = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    logprob_info = result.choices[0].logprobs
    assert logprob_info is not None
    assert logprob_info.token_logprobs is not None
    assert logprob_info.top_logprobs is not None
    first_step_count = len(logprob_info.top_logprobs[0])
    assert 5 <= first_step_count <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
    """Requests asking for too many logprobs must be rejected.

    Covers the non-streaming and the streaming path, then issues a normal
    request to confirm the server still answers.
    """
    # every request in this test uses the same token-ID prompt and settings
    shared_kwargs = dict(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    rejected = (openai.BadRequestError, openai.APIError)

    with pytest.raises(rejected):
        # vLLM has higher default max_logprobs (20 instead of 5) to support
        # both Completion API and Chat Completion API
        await client.completions.create(**shared_kwargs, logprobs=21)

    with pytest.raises(rejected):
        stream = await client.completions.create(
            **shared_kwargs,
            logprobs=30,
            stream=True,
        )
        # drain the stream; the error may only surface while iterating
        async for chunk in stream:
            ...

    # the server should still work afterwards
    follow_up = await client.completions.create(**shared_kwargs)
    # NOTE(review): a length is never negative, so this assertion cannot
    # fail; the effective check is that the request above did not raise.
    assert len(follow_up.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    """Streamed text must match the non-streamed completion.

    Also checks that exactly one chunk carries a finish reason, that the
    last chunk's finish reason is "length", and that its text is non-empty.
    """
    prompt = "What is an LLM?"
    request_kwargs = dict(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )

    reference = await client.completions.create(**request_kwargs)
    reference_text = reference.choices[0].text

    stream = await client.completions.create(**request_kwargs, stream=True)
    pieces: List[str] = []
    finished_chunks = 0
    last_choice = None
    async for chunk in stream:
        last_choice = chunk.choices[0]
        pieces.append(last_choice.text)
        if last_choice.finish_reason is not None:
            finished_chunks += 1

    # finish reason should only return in last block
    assert finished_chunks == 1
    assert last_choice.finish_reason == "length"
    assert last_choice.text
    assert "".join(pieces) == reference_text
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    """Exercise ``stream_options`` for streaming and non-streaming requests.

    Streaming: checks which chunks carry ``usage`` for each combination of
    ``include_usage`` and ``continuous_usage_stats``.
    Non-streaming: any ``stream_options`` must be rejected with
    BadRequestError.
    """
    prompt = "What is the capital of France?"
    request_kwargs = dict(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )

    def _check_usage(usage):
        # usage must be present, positive, and internally consistent
        assert usage is not None
        assert usage.prompt_tokens > 0
        assert usage.completion_tokens > 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

    # stream=True, include_usage=False: no chunk carries usage, regardless
    # of continuous_usage_stats (False first, then True)
    for continuous in (False, True):
        stream = await client.completions.create(
            **request_kwargs,
            stream=True,
            stream_options={
                "include_usage": False,
                "continuous_usage_stats": continuous,
            })
        async for chunk in stream:
            assert chunk.usage is None

    # stream=True, include_usage=True, continuous_usage_stats=False:
    # regular chunks carry no usage; one extra chunk with usage and empty
    # choices follows the chunk that has a finish reason
    stream = await client.completions.create(
        **request_kwargs,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": False,
        })
    async for chunk in stream:
        is_last_content_chunk = chunk.choices[0].finish_reason is not None
        assert chunk.usage is None
        if is_last_content_chunk:
            final_chunk = await stream.__anext__()
            _check_usage(final_chunk.usage)
            assert final_chunk.choices == []

    # stream=True, include_usage=True, continuous_usage_stats=True:
    # every chunk carries usage, and the extra final chunk does too
    stream = await client.completions.create(
        **request_kwargs,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        })
    async for chunk in stream:
        _check_usage(chunk.usage)
        if chunk.choices[0].finish_reason is not None:
            final_chunk = await stream.__anext__()
            _check_usage(final_chunk.usage)
            assert final_chunk.choices == []

    # stream=False: each of these stream_options must be refused
    rejected_options = (
        {"include_usage": None},
        {"include_usage": True},
        {"continuous_usage_stats": None},
        {"continuous_usage_stats": True},
    )
    for options in rejected_options:
        with pytest.raises(BadRequestError):
            await client.completions.create(**request_kwargs,
                                            stream=False,
                                            stream_options=options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Send two identical prompts in one request, as text and as token IDs.

    For each prompt form this checks a plain batch, a batch with ``n=2``
    using beam search, and a streamed batch.
    """
    text_prompts = ["Hello, my name is"] * 2
    token_id_prompts = [[0, 0, 0, 0, 0]] * 2

    # test both text and token IDs
    for prompts in (text_prompts, token_id_prompts):
        shared_kwargs = dict(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )

        # plain list of prompts
        plain = await client.completions.create(**shared_kwargs)
        assert len(plain.choices) == 2
        assert plain.choices[0].text == plain.choices[1].text

        # n = 2
        beam = await client.completions.create(
            **shared_kwargs,
            n=2,
            extra_body=dict(
                # NOTE: this has to be true for n > 1 in vLLM, but not
                # necessary for official client.
                use_beam_search=True),
        )
        assert len(beam.choices) == 4
        beam_texts = [choice.text for choice in beam.choices]
        assert beam_texts[0] != beam_texts[1], (
            "beam search should be different")
        assert beam_texts[0] == beam_texts[2], (
            "two copies of the same prompt should be the same")
        assert beam_texts[1] == beam_texts[3], (
            "two copies of the same prompt should be the same")

        # streaming: accumulate text per choice index
        streamed = await client.completions.create(**shared_kwargs,
                                                   stream=True)
        collected = [""] * 2
        async for chunk in streamed:
            assert len(chunk.choices) == 1
            piece = chunk.choices[0]
            collected[piece.index] += piece.text
        assert collected[0] == collected[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check that ``logit_bias`` can force a token and can ban tokens.

    A +100 bias on one token ID should make the output consist of that
    token; a -100 bias on every token of an unbiased response should
    change the response.
    """
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def _encode(text):
        # token IDs of ``text`` without special tokens
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    forced_text = forced.choices[0].text
    assert len(forced_text) >= 5
    response_tokens = _encode(forced_text)
    expected_tokens = _encode(tokenizer.decode([token_id] * 5))
    # zip() stops at the shorter sequence, so only the common prefix is
    # compared
    assert all(actual == wanted
               for actual, wanted in zip(response_tokens, expected_tokens))

    # Test ban
    unbiased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = unbiased.choices[0].text
    response_tokens = _encode(first_response)

    banned_bias = {str(token): -100 for token in response_tokens}
    banned = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias=banned_bias,
    )
    assert first_response != banned.choices[0].text
@pytest.mark.asyncio
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
    """With ``allowed_token_ids`` set, the single generated token must be
    one of the allowed IDs."""
    prompt = "Hello, my name is"
    max_tokens = 1
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Test exclusive selection
    allowed_ids = [21555, 21557, 21558]
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        seed=42,
        extra_body=dict(allowed_token_ids=allowed_ids),
        logprobs=1,
    )
    generated_tokens = result.choices[0].logprobs.tokens
    assert len(generated_tokens) == 1
    generated_ids = tokenizer.convert_tokens_to_ids(generated_tokens)
    assert generated_ids[0] in allowed_ids
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    """Each of three sampled completions must parse as JSON and validate
    against ``sample_json_schema``."""
    guided_options = dict(guided_json=sample_json_schema,
                          guided_decoding_backend=guided_decoding_backend)
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 3
    for choice in result.choices:
        parsed = json.loads(choice.text)
        jsonschema.validate(instance=parsed, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    """Each of three sampled completions must fully match ``sample_regex``."""
    guided_options = dict(guided_regex=sample_regex,
                          guided_decoding_backend=guided_decoding_backend)
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 3
    for choice in result.choices:
        matched = re.fullmatch(sample_regex, choice.text)
        assert matched is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    """Each of two sampled completions must be one of the allowed choices."""
    guided_options = dict(guided_choice=sample_guided_choice,
                          guided_decoding_backend=guided_decoding_backend)
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options)

    assert result.id is not None
    assert len(result.choices) == 2
    for choice in result.choices:
        assert choice.text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):
    """Generate under a grammar constraint, then check the output parses
    with Lark and equals the expected SQL statement with spaces removed."""
    sql_prompt = ("Generate a sql state that select col_1 from "
                  "table_1 where it is equals to 1")
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=sql_prompt,
        temperature=1.0,
        max_tokens=500,
        extra_body=dict(guided_grammar=sample_sql_statements))
    generated = result.choices[0].text

    # use Lark to parse the output, and make sure it's a valid parse tree
    from lark import Lark
    Lark(sample_sql_statements).parse(generated)

    # remove spaces for comparison b/c we removed them in the grammar
    expected = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
    assert generated.strip() == expected
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check ``echo=True`` combined with ``logprobs`` for text and token-ID
    prompts.

    The returned text must begin with the prompt, and the logprob arrays
    must cover prompt plus completion (more than 5 entries), with no
    logprob reported for the very first token.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        prompt_text = tokenizer.decode(prompt) if isinstance(
            prompt, list) else prompt
        # Plain prefix comparison: the prompt is literal text, so it must
        # not be interpreted as a regular expression (metacharacters in a
        # prompt would otherwise change the meaning of the check).
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        # every position after the first reports between max(logprobs, 1)
        # and logprobs + 1 candidates
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    """Invalid guided-decoding requests must raise BadRequestError.

    Case 1 passes an integer as ``guided_json``; case 2 supplies both
    ``guided_regex`` and ``guided_json`` in the same request.
    """
    wrong_type_options = dict(guided_json=42,
                              guided_decoding_backend=guided_decoding_backend)
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=wrong_type_options)

    conflicting_options = dict(guided_regex=sample_regex,
                               guided_json=sample_json_schema)
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=conflicting_options)
tests/entrypoints/openai/test_embedding.py
View file @
e661d594
...
...
@@ -18,7 +18,6 @@ def embedding_server():
"--enforce-eager"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
]
with
RemoteOpenAIServer
(
EMBEDDING_MODEL_NAME
,
args
)
as
remote_server
:
...
...
tests/entrypoints/openai/test_oot_registration.py
View file @
e661d594
...
...
@@ -36,10 +36,12 @@ def test_oot_registration_for_api_server():
ctx
=
torch
.
multiprocessing
.
get_context
()
server
=
ctx
.
Process
(
target
=
server_function
,
args
=
(
port
,
))
server
.
start
()
MAX_SERVER_START_WAIT_S
=
60
client
=
OpenAI
(
base_url
=
f
"http://localhost:
{
port
}
/v1"
,
api_key
=
"token-abc123"
,
)
now
=
time
.
time
()
while
True
:
try
:
completion
=
client
.
chat
.
completions
.
create
(
...
...
@@ -57,6 +59,8 @@ def test_oot_registration_for_api_server():
except
OpenAIError
as
e
:
if
"Connection error"
in
str
(
e
):
time
.
sleep
(
3
)
if
time
.
time
()
-
now
>
MAX_SERVER_START_WAIT_S
:
raise
RuntimeError
(
"Server did not start in time"
)
from
e
else
:
raise
e
server
.
kill
()
...
...
tests/entrypoints/openai/test_return_tokens_as_ids.py
0 → 100644
View file @
e661d594
# Separate these tests out from test_completion and test_chat, because they
# require launching a second server with a different flag. Running both servers
# at the same time on a single node will OOM.
import
pytest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
default_server_args
# noqa: F401
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
from
.test_completion
import
zephyr_pa_files
# noqa: F401
from
.test_completion
import
MODEL_NAME
@pytest.fixture(scope="module")
def server_with_return_tokens_as_token_ids_flag(
        default_server_args):  # noqa: F811
    """Run a server with the default arguments plus
    ``--return-tokens-as-token-ids`` for the duration of the module."""
    extra_flag = ["--return-tokens-as-token-ids"]
    launcher = RemoteOpenAIServer(MODEL_NAME,
                                  default_server_args + extra_flag)
    with launcher as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_completion_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
    """Completion API: tokens returned as "token_id:<n>" must agree with
    the top-logprob keys and decode back to the returned text."""
    client = server_with_return_tokens_as_token_ids_flag.get_async_client()

    result = await client.completions.create(
        model=MODEL_NAME,
        # Include Unicode characters to test for dividing a single
        # character across multiple tokens: 🎉 is [28705, 31862] for the
        # Zephyr tokenizer
        prompt="Say 'Hello, world! 🎉'",
        echo=True,
        temperature=0,
        max_tokens=10,
        logprobs=1)

    first_choice = result.choices[0]
    text = first_choice.text
    token_strs = first_choice.logprobs.tokens
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Check that the token representations are consistent between raw tokens
    # and top_logprobs
    # Slice off the first one, because there's no scoring associated with BOS
    first_keys = []
    for logprob_by_tokens in first_choice.logprobs.top_logprobs[1:]:
        first_keys.append(next(iter(logprob_by_tokens)))
    assert token_strs[1:] == first_keys

    # Check that decoding the tokens gives the expected text
    token_ids = []
    for token in token_strs:
        token_ids.append(int(token.removeprefix("token_id:")))
    assert text == tokenizer.decode(token_ids, skip_special_tokens=True)
@pytest.mark.asyncio
async def test_chat_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
    """Chat API: tokens returned as "token_id:<n>" must decode back to the
    message content."""
    client = server_with_return_tokens_as_token_ids_flag.get_async_client()

    # Include Unicode characters to test for dividing a single
    # character across multiple tokens: 🎉 is [28705, 31862] for the
    # Zephyr tokenizer
    conversation = [{
        "role": "system",
        "content": "You like to respond in only emojis, like 🎉"
    }, {
        "role": "user",
        "content": "Please write some emojis: 🐱🐶🎉"
    }]
    response = await client.chat.completions.create(model=MODEL_NAME,
                                                    messages=conversation,
                                                    temperature=0,
                                                    max_tokens=8,
                                                    logprobs=True)

    text = response.choices[0].message.content
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    token_ids = [
        int(entry.token.removeprefix("token_id:"))
        for entry in response.choices[0].logprobs.content
    ]
    assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
tests/entrypoints/openai/test_serving_chat.py
View file @
e661d594
import
asyncio
from
contextlib
import
suppress
from
dataclasses
import
dataclass
from
unittest.mock
import
MagicMock
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
MODEL_NAME
=
"openai-community/gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
...
...
@@ -42,3 +47,37 @@ async def _async_serving_chat_init():
def test_async_serving_chat_init():
    """The object built by ``_async_serving_chat_init`` must expose the
    chat template it was configured with."""
    serving_chat = asyncio.run(_async_serving_chat_init())
    assert serving_chat.chat_template == CHAT_TEMPLATE
def test_serving_chat_should_set_correct_max_tokens():
    """Inspect the ``max_tokens`` passed to the mocked engine's ``generate``.

    Without an explicit ``max_tokens`` on the request the value is expected
    to be 93; after setting ``req.max_tokens = 10`` it is expected to be 10.
    """
    mock_engine = MagicMock(spec=AsyncLLMEngine)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)

    serving_chat = OpenAIServingChat(mock_engine,
                                     MockModelConfig(),
                                     served_model_names=[MODEL_NAME],
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
                                     lora_modules=None,
                                     prompt_adapters=None,
                                     request_logger=None)
    user_message = {"role": "user", "content": "what is 1+1?"}
    req = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[user_message],
        guided_decoding_backend="outlines",
    )

    def _max_tokens_seen_by_engine():
        # Errors from the call are deliberately ignored; only the arguments
        # recorded on the mocked generate() matter here.
        with suppress(Exception):
            asyncio.run(serving_chat.create_chat_completion(req))
        # AsyncLLMEngine.generate(inputs, sampling_params, ...)
        return mock_engine.generate.call_args.args[1].max_tokens

    assert _max_tokens_seen_by_engine() == 93

    req.max_tokens = 10
    assert _max_tokens_seen_by_engine() == 10
tests/kernels/test_attention.py
View file @
e661d594
...
...
@@ -29,7 +29,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
# FlashAttention forward only supports head dimension at most 128
# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
128
,
192
,
256
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
120
,
128
,
192
,
256
]
if
not
is_hip
()
else
[
64
,
80
,
96
,
112
,
128
]
BLOCK_SIZES
=
[
16
,
32
]
...
...
@@ -135,6 +135,8 @@ def test_paged_attention(
seed
:
int
,
device
:
str
,
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
...
...
tests/kernels/test_cache.py
View file @
e661d594
...
...
@@ -11,7 +11,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS
=
[
42
]
# Arbitrary values for testing
NUM_LAYERS
=
[
1
]
# Arbitrary values for testing
NUM_HEADS
=
[
8
]
# Arbitrary values for testing
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
128
,
192
,
256
]
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
120
,
128
,
192
,
256
]
BLOCK_SIZES
=
[
8
,
16
,
32
]
# Arbitrary values for testing
...
...
@@ -53,6 +53,8 @@ def test_copy_blocks(
kv_cache_dtype
:
str
,
device
:
str
,
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
...
...
@@ -125,6 +127,8 @@ def test_reshape_and_cache(
device
:
str
,
kv_cache_dtype
:
str
,
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
...
...
@@ -216,8 +220,6 @@ def test_reshape_and_cache_flash(
device
:
str
,
kv_cache_dtype
:
str
,
)
->
None
:
if
kv_cache_dtype
==
"fp8"
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
...
...
@@ -249,15 +251,33 @@ def test_reshape_and_cache_flash(
dtype
,
device
=
device
,
)
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
key_cache
,
value_cache
=
key_caches
[
0
].
contiguous
(
),
value_caches
[
0
].
contiguous
()
del
key_caches
del
value_caches
# Clone the KV caches.
cloned_key_cache
=
key_cache
.
clone
()
cloned_value_cache
=
value_cache
.
clone
()
if
kv_cache_dtype
==
"fp8"
:
cloned_key_cache
=
torch
.
empty_like
(
key_cache
,
dtype
=
torch
.
float16
)
ops
.
convert_fp8
(
cloned_key_cache
,
key_cache
)
cloned_value_cache
=
torch
.
empty_like
(
value_cache
,
dtype
=
torch
.
float16
)
ops
.
convert_fp8
(
cloned_value_cache
,
value_cache
)
else
:
cloned_key_cache
=
key_cache
.
clone
()
cloned_value_cache
=
value_cache
.
clone
()
# Using default kv_scale
k_scale
=
v_scale
=
1.0
# Call the reshape_and_cache kernel.
ops
.
reshape_and_cache_flash
(
key
,
value
,
key_cache
,
value_cache
,
slot_mapping
,
kv_cache_dtype
)
slot_mapping
,
kv_cache_dtype
,
k_scale
,
v_scale
)
if
kv_cache_dtype
==
"fp8"
:
result_key_cache
=
torch
.
empty_like
(
key_cache
,
dtype
=
torch
.
float16
)
ops
.
convert_fp8
(
result_key_cache
,
key_cache
)
result_value_cache
=
torch
.
empty_like
(
value_cache
,
dtype
=
torch
.
float16
)
ops
.
convert_fp8
(
result_value_cache
,
value_cache
)
# Run the reference implementation.
block_indicies
=
torch
.
div
(
slot_mapping
,
block_size
,
rounding_mode
=
"floor"
)
...
...
@@ -270,8 +290,18 @@ def test_reshape_and_cache_flash(
cloned_key_cache
[
block_idx
,
block_offset
,
:,
:]
=
key
[
i
]
cloned_value_cache
[
block_idx
,
block_offset
,
:,
:]
=
value
[
i
]
assert
torch
.
allclose
(
key_cache
,
cloned_key_cache
)
assert
torch
.
allclose
(
value_cache
,
cloned_value_cache
)
if
kv_cache_dtype
==
"fp8"
:
assert
torch
.
allclose
(
result_key_cache
,
cloned_key_cache
,
atol
=
0.001
,
rtol
=
0.1
)
assert
torch
.
allclose
(
result_value_cache
,
cloned_value_cache
,
atol
=
0.001
,
rtol
=
0.1
)
else
:
assert
torch
.
allclose
(
key_cache
,
cloned_key_cache
)
assert
torch
.
allclose
(
value_cache
,
cloned_value_cache
)
@
pytest
.
mark
.
parametrize
(
"direction"
,
COPYING_DIRECTION
)
...
...
@@ -300,6 +330,8 @@ def test_swap_blocks(
)
->
None
:
if
kv_cache_dtype
==
"fp8"
and
"cpu"
in
direction
:
pytest
.
skip
()
if
kv_cache_dtype
==
"fp8"
and
head_size
%
16
:
pytest
.
skip
()
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
...
...
tests/kernels/test_cutlass.py
View file @
e661d594
...
...
@@ -106,8 +106,8 @@ def cutlass_int8_gemm_helper(m: int,
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
@
pytest
.
mark
.
parametrize
(
"m"
,
[
512
,
222
,
100
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
16
,
32
,
64
,
128
,
256
,
512
,
222
,
100
,
33
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
4096
,
8192
,
16384
,
24576
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
...
...
@@ -119,8 +119,8 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
cutlass_fp8_gemm_helper
(
m
,
n
,
k
,
per_act_token
,
per_out_ch
,
use_bias
)
@
pytest
.
mark
.
parametrize
(
"m"
,
[
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
1
,
16
,
32
,
64
,
128
,
256
,
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
8192
,
16384
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
...
...
tests/kernels/test_flash_attn.py
View file @
e661d594
...
...
@@ -20,6 +20,7 @@ def ref_paged_attn(
block_tables
:
torch
.
Tensor
,
scale
:
float
,
sliding_window
:
Optional
[
int
]
=
None
,
soft_cap
:
Optional
[
float
]
=
None
,
)
->
torch
.
Tensor
:
num_seqs
=
len
(
query_lens
)
block_tables
=
block_tables
.
cpu
().
numpy
()
...
...
@@ -53,6 +54,8 @@ def ref_paged_attn(
(
query_len
+
sliding_window
)
+
1
).
bool
().
logical_not
()
mask
|=
sliding_window_mask
if
soft_cap
is
not
None
:
attn
=
soft_cap
*
torch
.
tanh
(
attn
/
soft_cap
)
attn
.
masked_fill_
(
mask
,
float
(
"-inf"
))
attn
=
torch
.
softmax
(
attn
,
dim
=-
1
).
to
(
v
.
dtype
)
out
=
torch
.
einsum
(
"hqk,khd->qhd"
,
attn
,
v
)
...
...
@@ -68,13 +71,15 @@ def ref_paged_attn(
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
BLOCK_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
torch
.
inference_mode
@
pytest
.
mark
.
parametrize
(
"soft_cap"
,
[
None
,
10.0
,
50.0
])
@
torch
.
inference_mode
()
def
test_flash_attn_with_paged_kv
(
kv_lens
:
List
[
int
],
num_heads
:
Tuple
[
int
,
int
],
head_size
:
int
,
dtype
:
torch
.
dtype
,
block_size
:
int
,
soft_cap
:
Optional
[
float
],
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
...
...
@@ -108,6 +113,7 @@ def test_flash_attn_with_paged_kv(
causal
=
True
,
block_table
=
block_tables
,
cache_seqlens
=
kv_lens_tensor
,
softcap
=
soft_cap
if
soft_cap
is
not
None
else
0
,
).
squeeze
(
1
)
ref_output
=
ref_paged_attn
(
...
...
@@ -118,6 +124,7 @@ def test_flash_attn_with_paged_kv(
kv_lens
=
kv_lens
,
block_tables
=
block_tables
,
scale
=
scale
,
soft_cap
=
soft_cap
,
)
assert
torch
.
allclose
(
output
,
ref_output
,
atol
=
1e-2
,
rtol
=
1e-2
),
\
f
"
{
torch
.
max
(
torch
.
abs
(
output
-
ref_output
))
}
"
...
...
@@ -129,7 +136,8 @@ def test_flash_attn_with_paged_kv(
@
pytest
.
mark
.
parametrize
(
"block_size"
,
BLOCK_SIZES
)
@
pytest
.
mark
.
parametrize
(
"sliding_window"
,
[
None
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
torch
.
inference_mode
@
pytest
.
mark
.
parametrize
(
"soft_cap"
,
[
None
,
10.0
,
50.0
])
@
torch
.
inference_mode
()
def
test_varlen_with_paged_kv
(
seq_lens
:
List
[
Tuple
[
int
,
int
]],
num_heads
:
Tuple
[
int
,
int
],
...
...
@@ -137,6 +145,7 @@ def test_varlen_with_paged_kv(
sliding_window
:
Optional
[
int
],
dtype
:
torch
.
dtype
,
block_size
:
int
,
soft_cap
:
Optional
[
float
],
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
cuda
.
manual_seed_all
(
0
)
...
...
@@ -163,10 +172,6 @@ def test_varlen_with_paged_kv(
head_size
,
dtype
=
dtype
)
value_cache
=
torch
.
randn_like
(
key_cache
)
# Normalize the scale of the key and value caches to mitigate
# numerical instability.
key_cache
/=
head_size
**
0.5
value_cache
/=
head_size
**
0.5
cu_query_lens
=
torch
.
tensor
([
0
]
+
query_lens
,
dtype
=
torch
.
int32
).
cumsum
(
dim
=
0
,
dtype
=
torch
.
int32
)
...
...
@@ -192,6 +197,7 @@ def test_varlen_with_paged_kv(
causal
=
True
,
window_size
=
window_size
,
block_table
=
block_tables
,
softcap
=
soft_cap
if
soft_cap
is
not
None
else
0
,
)
ref_output
=
ref_paged_attn
(
...
...
@@ -203,6 +209,7 @@ def test_varlen_with_paged_kv(
block_tables
=
block_tables
,
scale
=
scale
,
sliding_window
=
sliding_window
,
soft_cap
=
soft_cap
,
)
assert
torch
.
allclose
(
output
,
ref_output
,
atol
=
1e-2
,
rtol
=
1e-2
),
\
f
"
{
torch
.
max
(
torch
.
abs
(
output
-
ref_output
))
}
"
tests/kernels/test_int8_quant.py
View file @
e661d594
import
pytest
import
torch
# ruff: noqa: F401
import
vllm._C
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
vllm._custom_ops
import
scaled_int8_quant
...
...
tests/kernels/test_marlin_gemm.py
View file @
e661d594
...
...
@@ -9,11 +9,14 @@ from tests.quantization.utils import is_quant_method_supported
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.quantization.gptq_marlin_24
import
(
GPTQ_MARLIN_24_MAX_PARALLEL
,
GPTQ_MARLIN_24_MIN_THREAD_N
,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
)
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
)
from
vllm.model_executor.layers.quantization.qqq
import
(
MARLIN_QQQ_MAX_PARALLEL
,
MARLIN_QQQ_MIN_THREAD_N
,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES
,
MARLIN_QQQ_SUPPORTED_NUM_BITS
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
GPTQ_MARLIN_MAX_PARALLEL
,
GPTQ_MARLIN_MIN_THREAD_N
,
MARLIN_SUPPORTED_GROUP_SIZES
,
MARLIN_SUPPORTED_NUM_BITS
,
marlin_
make_empty_g_idx
,
marlin_permute_scal
es
)
MARLIN_SUPPORTED_GROUP_SIZES
,
marlin_make_empty_g_idx
,
marlin_
permute_scales
,
query_marlin_supported_quant_typ
es
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_fp8
import
(
pack_fp8_to_int32
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test
import
(
...
...
@@ -21,12 +24,14 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
marlin_weights
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_24
import
(
marlin_24_quantize
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq
import
(
# noqa: E501
marlin_qqq_quantize
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
awq_pack
,
gptq_pack
,
quantize_weights
,
quantize_weights_with_zp
,
sort_weights
)
awq_pack
,
gptq_pack
,
gptq_quantize_weights
,
quantize_weights
,
sort_weights
)
ACT_ORDER_OPTS
=
[
False
,
True
]
K_FULL_OPTS
=
[
False
,
True
]
USE_FP32_REDUCE_OPTS
=
[
False
,
True
]
MARLIN_K_CHUNKS
=
[
128
]
MARLIN_N_CHUNKS
=
[
64
,
128
,
256
]
...
...
@@ -59,12 +64,13 @@ def rand_data(shape, dtype=torch.float16):
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
query_marlin_supported_quant_types
(
False
))
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_gptq_marlin_repack
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
act_order
,
mnk_factors
):
def
test_gptq_marlin_repack
(
k_chunk
,
n_chunk
,
quant_type
,
group_size
,
act_order
,
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
...
...
@@ -89,11 +95,11 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
b_weight
=
rand_data
((
size_k
,
size_n
))
# Quantize (and apply act_order if provided)
w_ref
,
q_w
,
s
,
g_idx
,
rand_perm
=
quantize_weights
(
b_weight
,
num_bits
,
group_size
,
act_order
)
w_ref
,
q_w
,
s
,
g_idx
,
rand_perm
=
gptq_
quantize_weights
(
b_weight
,
quant_type
,
group_size
,
act_order
)
# Pack to GPTQ format
q_w_gptq
=
gptq_pack
(
q_w
,
num
_bits
,
size_k
,
size_n
)
q_w_gptq
=
gptq_pack
(
q_w
,
quant_type
.
size
_bits
,
size_k
,
size_n
)
# For act_order, sort the "weights" and "g_idx" so that group ids are
# increasing
...
...
@@ -102,8 +108,9 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
q_w
,
g_idx
,
sort_indices
=
sort_weights
(
q_w
,
g_idx
)
# Pack to Marlin format
weight_perm
=
get_weight_perm
(
num_bits
)
marlin_q_w_1
=
marlin_weights
(
q_w
,
size_k
,
size_n
,
num_bits
,
weight_perm
)
weight_perm
=
get_weight_perm
(
quant_type
.
size_bits
)
marlin_q_w_1
=
marlin_weights
(
q_w
,
size_k
,
size_n
,
quant_type
.
size_bits
,
weight_perm
)
# Run Marlin repack GPU kernel
marlin_q_w_2
=
ops
.
gptq_marlin_repack
(
...
...
@@ -111,7 +118,7 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
sort_indices
,
size_k
,
size_n
,
num
_bits
,
quant_type
.
size
_bits
,
)
torch
.
cuda
.
synchronize
()
...
...
@@ -122,10 +129,11 @@ def test_gptq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
query_marlin_supported_quant_types
(
False
))
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_awq_marlin_repack
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
def
test_awq_marlin_repack
(
k_chunk
,
n_chunk
,
quant_type
,
group_size
,
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
...
...
@@ -144,22 +152,25 @@ def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size,
b_weight
=
rand_data
((
size_k
,
size_n
))
# Quantize
w_ref
,
q_w
,
s
,
zp
=
quantize_weights_with_zp
(
b_weight
,
num_bits
,
group_size
)
w_ref
,
q_w
,
s
,
zp
=
quantize_weights
(
b_weight
,
quant_type
,
group_size
,
zero_points
=
True
)
# Pack to AWQ format
q_w_awq
=
awq_pack
(
q_w
,
num
_bits
,
size_k
,
size_n
)
q_w_awq
=
awq_pack
(
q_w
,
quant_type
.
size
_bits
,
size_k
,
size_n
)
# Pack to Marlin format
weight_perm
=
get_weight_perm
(
num_bits
)
marlin_q_w_1
=
marlin_weights
(
q_w
,
size_k
,
size_n
,
num_bits
,
weight_perm
)
weight_perm
=
get_weight_perm
(
quant_type
.
size_bits
)
marlin_q_w_1
=
marlin_weights
(
q_w
,
size_k
,
size_n
,
quant_type
.
size_bits
,
weight_perm
)
# Run Marlin repack GPU kernel
marlin_q_w_2
=
ops
.
awq_marlin_repack
(
q_w_awq
,
size_k
,
size_n
,
num
_bits
,
quant_type
.
size
_bits
,
)
torch
.
cuda
.
synchronize
()
...
...
@@ -170,19 +181,22 @@ def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size,
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
query_marlin_supported_quant_types
(
False
))
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"is_k_full"
,
K_FULL_OPTS
)
@
pytest
.
mark
.
parametrize
(
"use_fp32_reduce"
,
USE_FP32_REDUCE_OPTS
)
def
test_gptq_marlin_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
quant_type
,
group_size
,
mnk_factors
,
act_order
,
is_k_full
,
use_fp32_reduce
,
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
...
...
@@ -203,7 +217,7 @@ def test_gptq_marlin_gemm(
b_weight
=
rand_data
((
size_k
,
size_n
))
w_ref
,
marlin_q_w
,
marlin_s
,
g_idx
,
sort_indices
,
_
=
marlin_quantize
(
b_weight
,
num_bits
,
group_size
,
act_order
)
b_weight
,
quant_type
,
group_size
,
act_order
)
marlin_zp
=
marlin_make_empty_g_idx
(
marlin_s
.
device
)
...
...
@@ -218,12 +232,13 @@ def test_gptq_marlin_gemm(
g_idx
,
sort_indices
,
workspace
.
scratch
,
num_bits
,
quant_type
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
is_k_full
,
is_k_full
=
is_k_full
,
has_zp
=
False
,
use_fp32_reduce
=
use_fp32_reduce
,
)
output_ref
=
torch
.
matmul
(
a_input
,
w_ref
)
...
...
@@ -239,10 +254,10 @@ def test_gptq_marlin_gemm(
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_24_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_24_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"
num_bits
"
,
GPTQ_MARLIN_24_SUPPORTED_
NUM_BIT
S
)
@
pytest
.
mark
.
parametrize
(
"
quant_type
"
,
GPTQ_MARLIN_24_SUPPORTED_
QUANT_TYPE
S
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_gptq_marlin_24_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
def
test_gptq_marlin_24_gemm
(
k_chunk
,
n_chunk
,
quant_type
,
group_size
,
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
...
...
@@ -257,7 +272,7 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size,
b_weight
=
rand_data
((
size_k
,
size_n
))
(
w_24_ref
,
marlin_24_q_w_comp
,
marlin_24_meta
,
marlin_24_s
)
=
marlin_24_quantize
(
b_weight
,
num_bits
,
group_size
)
marlin_24_s
)
=
marlin_24_quantize
(
b_weight
,
quant_type
,
group_size
)
workspace_24
=
MarlinWorkspace
(
size_n
,
GPTQ_MARLIN_24_MIN_THREAD_N
,
GPTQ_MARLIN_24_MAX_PARALLEL
)
...
...
@@ -270,7 +285,7 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size,
marlin_24_meta
,
marlin_24_s
,
workspace_24
.
scratch
,
num_bits
,
quant_type
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
...
...
@@ -362,15 +377,18 @@ def test_fp8_marlin_gemm(
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"quant_type"
,
query_marlin_supported_quant_types
(
True
))
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"use_fp32_reduce"
,
USE_FP32_REDUCE_OPTS
)
def
test_awq_marlin_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
quant_type
,
group_size
,
mnk_factors
,
use_fp32_reduce
,
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
...
...
@@ -385,7 +403,7 @@ def test_awq_marlin_gemm(
b_weight
=
rand_data
((
size_k
,
size_n
))
w_ref
,
marlin_q_w
,
marlin_s
,
marlin_zp
=
awq_marlin_quantize
(
b_weight
,
num_bits
,
group_size
)
b_weight
,
quant_type
,
group_size
)
g_idx
=
torch
.
empty
(
0
,
dtype
=
torch
.
int
,
device
=
marlin_q_w
.
device
)
sort_indices
=
torch
.
empty
(
0
,
dtype
=
torch
.
int
,
device
=
marlin_q_w
.
device
)
...
...
@@ -403,12 +421,13 @@ def test_awq_marlin_gemm(
g_idx
,
sort_indices
,
workspace
.
scratch
,
num_bits
,
quant_type
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
is_k_full
,
has_zp
,
is_k_full
=
is_k_full
,
has_zp
=
has_zp
,
use_fp32_reduce
=
use_fp32_reduce
,
)
output_ref
=
torch
.
matmul
(
a_input
,
w_ref
)
...
...
@@ -418,3 +437,64 @@ def test_awq_marlin_gemm(
print
(
"max_diff = {}"
.
format
(
max_diff
))
assert
max_diff
<
0.04
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"qqq"
),
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_QQQ_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_marlin_qqq_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
,
):
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
size_k
=
k_chunk
*
k_factor
size_n
=
n_chunk
*
n_factor
print
(
f
"MNK =
{
size_m
}
{
size_n
}
{
size_k
}
"
)
print
(
f
"groupsize =
{
group_size
}
"
)
a_input
=
rand_data
((
size_m
,
size_k
))
b_weight
=
rand_data
((
size_k
,
size_n
))
# Quantize activations
s_a
=
a_input
.
abs
().
max
(
dim
=-
1
,
keepdim
=
True
)[
0
].
div
(
int8_traits
.
max
).
to
(
torch
.
float
)
q_a
=
(
a_input
/
s_a
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
# Quantize weights
w_ref
,
marlin_qqq_q_w
,
marlin_qqq_s_group
,
marlin_qqq_s_channel
=
\
marlin_qqq_quantize
(
b_weight
,
num_bits
,
group_size
)
workspace
=
MarlinWorkspace
(
size_n
,
MARLIN_QQQ_MIN_THREAD_N
,
MARLIN_QQQ_MAX_PARALLEL
)
output
=
ops
.
marlin_qqq_gemm
(
q_a
,
marlin_qqq_q_w
,
s_a
,
marlin_qqq_s_channel
,
marlin_qqq_s_group
,
workspace
.
scratch
,
a_input
.
shape
[
0
],
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
)
output_ref
=
torch
.
matmul
(
q_a
.
half
()
*
s_a
.
half
(),
w_ref
)
torch
.
cuda
.
synchronize
()
max_diff
=
compute_max_diff
(
output
,
output_ref
)
print
(
"max_diff = {}"
.
format
(
max_diff
))
assert
max_diff
<
0.04
tests/kernels/test_pos_encoding.py
View file @
e661d594
...
...
@@ -10,7 +10,7 @@ from .allclose_default import get_default_atol, get_default_rtol
IS_NEOX_STYLE
=
[
True
,
False
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
128
,
192
,
256
]
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
120
,
128
,
192
,
256
]
ROTARY_DIMS
=
[
None
,
32
]
# None means rotary dim == head size
NUM_HEADS
=
[
7
,
17
]
# Arbitrary values for testing
BATCH_SIZES
=
[
1
,
5
]
# Arbitrary values for testing
...
...
tests/kernels/test_sampler.py
View file @
e661d594
import
gc
from
unittest.mock
import
patch
import
pytest
import
torch
import
triton
import
triton.language
as
tl
from
vllm.model_executor.layers.ops.sample
import
(
MAX_TRITON_N_COLS
,
_uniform_to_exponential
,
get_num_triton_sampler_splits
,
sample
)
from
vllm.model_executor.layers.ops.sample
import
(
_sample_triton
,
_uniform_to_exponential
,
sample
)
from
vllm.model_executor.sampling_metadata
import
SamplingTensors
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.triton_utils.libentry
import
LibEntry
from
vllm.triton_utils.sample
import
(
MAX_TRITON_N_COLS
,
get_num_triton_sampler_splits
)
SINGLE_SPLIT_VOCAB_SIZE
=
32000
# llama/mistral/mixtral vocab size
MULTI_SPLIT_VOCAB_SIZE
=
MAX_TRITON_N_COLS
+
100
...
...
@@ -75,15 +79,20 @@ def test_sample_decoding_only(random_sampling, max_best_of,
seeds
=
torch
.
randint
(
1
,
torch
.
iinfo
(
torch
.
long
).
max
,
(
n_splits
,
bs
),
device
=
"cuda"
).
mul_
(
random_sampling_mask
)
sampled_tokens
,
sampled_logprobs
,
sampled_modified_probs
=
sample
(
probs
=
probs
,
logprobs
=
logprobs
,
sample_indices
=
sample_indices
,
seeds
=
seeds
,
max_best_of
=
max_best_of
,
modify_greedy_probs
=
modify_greedy_probs
,
save_logprobs
=
save_logprobs
,
_save_modified_probs
=
True
)
#The current _sample_triton does not utilize the
# libentry decoration. The purpose of adding this patch is to test
# the correctness of libentry.
with
patch
(
"vllm.model_executor.layers.ops.sample._sample_triton"
,
LibEntry
(
_sample_triton
)):
sampled_tokens
,
sampled_logprobs
,
sampled_modified_probs
=
sample
(
probs
=
probs
,
logprobs
=
logprobs
,
sample_indices
=
sample_indices
,
seeds
=
seeds
,
max_best_of
=
max_best_of
,
modify_greedy_probs
=
modify_greedy_probs
,
save_logprobs
=
save_logprobs
,
_save_modified_probs
=
True
)
assert
sampled_tokens
.
shape
==
(
bs
,
max_best_of
)
for
i
in
range
(
bs
):
assert
torch
.
all
(
sampled_tokens
[
i
]
==
i
*
(
vocab_size
//
bs
))
...
...
@@ -129,6 +138,7 @@ def test_sample_decoding_only(random_sampling, max_best_of,
[
SINGLE_SPLIT_VOCAB_SIZE
,
MULTI_SPLIT_VOCAB_SIZE
])
def
test_sample_prompt_logprobs
(
random_sampling
,
max_best_of
,
modify_greedy_probs
,
seed
,
vocab_size
):
set_random_seed
(
seed
)
prompt_sizes
=
[
16
,
32
,
64
,
128
]
*
2
samples
=
8
...
...
@@ -156,14 +166,17 @@ def test_sample_prompt_logprobs(random_sampling, max_best_of,
seeds
=
torch
.
randint
(
1
,
torch
.
iinfo
(
torch
.
long
).
max
,
(
n_splits
,
samples
),
device
=
"cuda"
).
mul_
(
random_sampling_mask
)
sampled_tokens
,
sampled_logprobs
,
_
=
sample
(
probs
=
probs
,
logprobs
=
logprobs
,
sample_indices
=
sample_indices
,
seeds
=
seeds
,
max_best_of
=
max_best_of
,
modify_greedy_probs
=
modify_greedy_probs
,
save_logprobs
=
True
)
#ditto
with
patch
(
"vllm.model_executor.layers.ops.sample._sample_triton"
,
LibEntry
(
_sample_triton
)):
sampled_tokens
,
sampled_logprobs
,
_
=
sample
(
probs
=
probs
,
logprobs
=
logprobs
,
sample_indices
=
sample_indices
,
seeds
=
seeds
,
max_best_of
=
max_best_of
,
modify_greedy_probs
=
modify_greedy_probs
,
save_logprobs
=
True
)
assert
sampled_tokens
.
shape
==
(
samples
,
max_best_of
)
assert
sampled_logprobs
.
shape
==
(
samples
,
max_best_of
)
for
i
,
t
in
enumerate
(
sample_indices
):
...
...
tests/lora/test_gemma.py
View file @
e661d594
...
...
@@ -37,7 +37,7 @@ def test_gemma_lora(gemma_lora_files):
expected_lora_output
=
[
"more important than knowledge.
\n
Author: Albert Einstein
\n
"
,
"everyone else is already taken.
\n
Author: Oscar Wilde
\n
"
,
"so little time
\n
Author: Frank Zappa
\n
"
,
"so little time
.
\n
Author: Frank Zappa
\n
"
,
]
output1
=
do_sample
(
llm
,
gemma_lora_files
,
lora_id
=
1
)
...
...
tests/lora/test_layers.py
View file @
e661d594
...
...
@@ -22,14 +22,17 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA
,
MergedQKVParallelLinearWithLora
,
QKVParallelLinearWithLora
,
ReplicatedLinearWithLoRA
,
RowParallelLinearWithLoRA
,
VocabParallelEmbeddingWithLoRA
)
# yapf: enable
from
vllm.lora.models
import
(
LongContextLoRAContext
,
LoRALayerWeights
,
PackedLoRALayerWeights
,
convert_mapping
)
PackedLoRALayerWeights
)
from
vllm.lora.punica
import
PunicaWrapper
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
QKVParallelLinear
,
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
...
...
@@ -47,6 +50,9 @@ TOLERANCES = {
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
# We will launch different triton kernels between the prefill and decode
# stages, so we need to verify this. prefill stage(True) or decode stage(False)
STAGES
=
[
True
,
False
]
def
get_random_id_to_index
(
num_loras
:
int
,
...
...
@@ -182,10 +188,12 @@ def create_random_inputs(
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
def
test_embeddings
(
dist_init
,
num_loras
,
device
,
vocab_size
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_embeddings
(
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
torch
.
set_default_device
(
device
)
max_loras
=
8
punica_wrapper
=
PunicaWrapper
(
8192
,
256
,
device
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
)
...
...
@@ -204,7 +212,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
id_to_index
=
get_random_id_to_index
(
num_loras
,
max_loras
)
embedding
,
lora_embedding
=
create_random_embedding_layer
()
lora_embedding
.
set_mapping
(
punica_wrapper
)
lora_dict
,
_
=
populate_loras
(
id_to_index
,
layer
=
lora_embedding
,
...
...
@@ -217,12 +225,12 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
input_size
=
(
200
,
),
input_range
=
(
1
,
vocab_size
),
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
)
lora_result
=
lora_embedding
(
torch
.
cat
(
inputs
))
...
...
@@ -255,12 +263,12 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
input_size
=
(
200
,
),
input_range
=
(
1
,
vocab_size
),
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
lora_result
=
lora_embedding
(
torch
.
cat
(
inputs
))
expected_result
=
embedding
(
torch
.
cat
(
inputs
))
...
...
@@ -278,11 +286,13 @@ def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_embeddings_with_new_embeddings
(
dist_init
,
num_loras
,
device
,
vocab_size
)
->
None
:
vocab_size
,
stage
)
->
None
:
torch
.
set_default_device
(
device
)
max_loras
=
8
punica_wrapper
=
PunicaWrapper
(
8192
,
256
,
device
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
)
...
...
@@ -318,6 +328,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
generate_embeddings_tensor
=
256
,
)
lora_embedding
.
set_mapping
(
punica_wrapper
)
# All embeddings tensors have the same shape.
embeddings_tensors
=
[
lora_dict
[
id
].
embeddings_tensor
for
id
in
sorted
(
lora_dict
.
keys
())
...
...
@@ -334,8 +345,12 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
input_size
=
(
200
,
),
input_range
=
(
1
,
vocab_size
),
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
original_inputs
=
deepcopy
(
inputs
)
# Force some of the inputs to be in the extended embeddings range
...
...
@@ -349,11 +364,6 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
(
embedding_id
+
1
)
*
embeddings_tensor_len
-
1
)
original_input_
[
-
2
]
=
vocab_size
+
embeddings_tensor_len
-
1
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
expanded_embedding
.
weight
[
vocab_size
:
vocab_size
+
(
embeddings_tensor_len
*
max_loras
)]
=
torch
.
cat
(
embeddings_tensors
)
...
...
@@ -390,15 +400,13 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
input_size
=
(
200
,
),
input_range
=
(
1
,
vocab_size
),
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
original_inputs
=
deepcopy
(
inputs
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_embedding
.
set_mapping
(
*
mapping_info
,
)
lora_result
=
lora_embedding
(
torch
.
cat
(
original_inputs
))
expected_result
=
expanded_embedding
(
torch
.
cat
(
inputs
))
...
...
@@ -413,11 +421,13 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
512
,
32000
,
64000
,
128000
])
def
test_lm_head_logits_processor
(
dist_init
,
num_loras
,
device
,
vocab_size
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_lm_head_logits_processor
(
dist_init
,
num_loras
,
device
,
vocab_size
,
stage
)
->
None
:
torch
.
set_default_device
(
device
)
max_loras
=
8
punica_wrapper
=
PunicaWrapper
(
8192
,
256
,
device
)
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
lora_dtype
=
torch
.
float16
)
...
...
@@ -443,7 +453,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
id_to_index
=
get_random_id_to_index
(
num_loras
,
max_loras
)
linear
,
logits_processor
,
lora_logits_processor
=
_pretest
()
lora_logits_processor
.
set_mapping
(
punica_wrapper
)
# NOTE: all the generated loras share the same embeddings tensor.
lora_dict
,
_
=
populate_loras
(
id_to_index
,
...
...
@@ -461,17 +471,17 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
input_range
=
(
0
,
1
),
input_type
=
torch
.
float16
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
input_
=
torch
.
rand
(
20
,
1024
)
mapping_info
=
convert_mapping
(
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
,
)
lora_logits_processor
.
set_mapping
(
*
mapping_info
,
)
input_
=
torch
.
rand
(
20
,
1024
)
lora_result
=
lora_logits_processor
.
_get_logits
(
hidden_states
=
torch
.
cat
(
inputs
),
...
...
@@ -510,12 +520,16 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
input_range
=
(
0
,
1
),
input_type
=
torch
.
float16
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
)
lora_logits_processor
.
set_mapping
(
*
mapping_info
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
vocab_size
,
lora_config
.
lora_extra_vocab_size
,
)
lora_result
=
lora_logits_processor
.
_get_logits
(
hidden_states
=
torch
.
cat
(
inputs
),
...
...
@@ -533,15 +547,118 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
atol
=
atol
)
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("stage", STAGES)
def test_linear_replicated(dist_init, num_loras, device, stage) -> None:
    """Check ReplicatedLinearWithLoRA against a hand-computed LoRA result.

    For each of ten seeds a fresh 4096x4096 fp16 ``ReplicatedLinear`` is
    wrapped, LoRAs are installed through ``populate_loras``, and the wrapped
    layer's output is compared with
    ``linear(x) + x @ lora_a @ lora_b * scaling`` evaluated per input.
    Every slot is then reset and the wrapped layer must agree with the
    plain layer again.
    """
    torch.set_default_device(device)
    punica_wrapper = PunicaWrapper(8192, 256, device)
    max_loras = 8
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             lora_dtype=torch.float16)

    def build_layers():
        # Plain layer with random weights plus its LoRA-enabled wrapper.
        base = ReplicatedLinear(4096,
                                4096,
                                bias=False,
                                params_dtype=torch.float16)
        base.weight.data = torch.rand_like(base.weight.data)
        wrapped = ReplicatedLinearWithLoRA(base)
        wrapped.create_lora_weights(max_loras, lora_config)
        return base, wrapped

    def make_inputs(lora_ids):
        # Same input recipe for both phases; only the active ids differ.
        return create_random_inputs(
            active_lora_ids=lora_ids,
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
            input_type=torch.float16,
        )

    def refresh_metadata(index_map, prompt_map, slot_map):
        # Push the current batch's mapping into the shared punica wrapper.
        mapping = LoRAMapping(index_map, prompt_map, is_prefill=stage)
        punica_wrapper.update_metadata(
            mapping,
            slot_map,
            max_loras,
            512,
            lora_config.lora_extra_vocab_size,
        )

    for seed in range(10):
        set_random_seed(seed)

        id_to_index = get_random_id_to_index(num_loras, max_loras)
        linear, lora_linear = build_layers()
        lora_linear.set_mapping(punica_wrapper)
        lora_dict, _ = populate_loras(
            id_to_index,
            layer=lora_linear,
            layer_weights=linear.weight,
        )

        # Phase 1: LoRAs active.
        inputs, index_mapping, prompt_mapping = make_inputs(
            list(lora_dict.keys()))
        refresh_metadata(index_mapping, prompt_mapping, id_to_index)

        lora_result = lora_linear(torch.cat(inputs))[0]

        expected_results: List[torch.Tensor] = []
        for input_, lora_id in zip(inputs, prompt_mapping):
            lora = lora_dict[lora_id]
            result = linear(input_)[0]
            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
            expected_results.append(result)
        expected_result = torch.cat(expected_results)

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)

        # Phase 2: check that resetting the lora weights succeeds.
        for slot_idx in range(max_loras):
            lora_linear.reset_lora(slot_idx)

        inputs, index_mapping, prompt_mapping = make_inputs([0])
        refresh_metadata(index_mapping, prompt_mapping, id_to_index)

        stacked = torch.cat(inputs)
        lora_result = lora_linear(stacked)[0]
        expected_result = linear(stacked)[0]

        rtol, atol = TOLERANCES[lora_result.dtype]
        assert torch.allclose(lora_result,
                              expected_result,
                              rtol=rtol,
                              atol=atol)
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"num_loras"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"orientation"
,
[
"row"
,
"column"
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_linear_parallel
(
dist_init
,
num_loras
,
orientation
,
fully_shard
,
device
)
->
None
:
device
,
stage
)
->
None
:
torch
.
set_default_device
(
device
)
punica_wrapper
=
PunicaWrapper
(
8192
,
256
,
device
)
max_loras
=
8
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
...
...
@@ -575,7 +692,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
id_to_index
=
get_random_id_to_index
(
num_loras
,
max_loras
)
linear
,
lora_linear
=
create_random_linear_parallel_layer
()
lora_linear
.
set_mapping
(
punica_wrapper
)
lora_dict
,
_
=
populate_loras
(
id_to_index
,
layer
=
lora_linear
,
...
...
@@ -589,16 +706,16 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
input_range
=
(
0
,
1
),
input_type
=
torch
.
float16
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
mapping_info
=
convert_mapping
(
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
,
)
lora_linear
.
set_mapping
(
*
mapping_info
,
)
lora_result
=
lora_linear
(
torch
.
cat
(
inputs
))[
0
]
...
...
@@ -628,11 +745,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
input_range
=
(
0
,
1
),
input_type
=
torch
.
float16
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
mapping_info
=
convert_mapping
(
lora_mapping
,
id_to_index
,
max_loras
,
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
)
lora_linear
.
set_mapping
(
*
mapping_info
,
)
lora_result
=
lora_linear
(
torch
.
cat
(
inputs
))[
0
]
expected_result
=
linear
(
torch
.
cat
(
inputs
))[
0
]
...
...
@@ -649,10 +767,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
@
pytest
.
mark
.
parametrize
(
"repeats"
,
[
1
,
2
,
3
])
@
pytest
.
mark
.
parametrize
(
"fully_shard"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"stage"
,
STAGES
)
def
test_column_parallel_packed
(
dist_init
,
num_loras
,
repeats
,
fully_shard
,
device
)
->
None
:
device
,
stage
)
->
None
:
torch
.
set_default_device
(
device
)
punica_wrapper
=
PunicaWrapper
(
8192
,
256
,
device
)
max_loras
=
8
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
...
...
@@ -707,7 +827,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
id_to_index
=
get_random_id_to_index
(
num_loras
,
max_loras
)
linear
,
lora_linear
=
create_column_parallel_packed_layer
()
lora_linear
.
set_mapping
(
punica_wrapper
)
lora_dict
,
sublora_dict
=
populate_loras
(
id_to_index
,
layer
=
lora_linear
,
...
...
@@ -722,16 +842,17 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
input_range
=
(
0
,
1
),
input_type
=
torch
.
float16
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
mapping_info
=
convert_mapping
(
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
,
)
lora_linear
.
set_mapping
(
*
mapping_info
)
lora_result
=
lora_linear
(
torch
.
cat
(
inputs
))[
0
]
...
...
@@ -762,16 +883,18 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
input_range
=
(
0
,
1
),
input_type
=
torch
.
float16
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
,
is_prefill
=
stage
)
mapping_info
=
convert_mapping
(
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
512
,
lora_config
.
lora_extra_vocab_size
,
)
lora_linear
.
set_mapping
(
*
mapping_info
)
#
lora_linear.set_mapping(*mapping_info)
lora_result
=
lora_linear
(
torch
.
cat
(
inputs
))[
0
]
expected_result
=
linear
(
torch
.
cat
(
inputs
))[
0
]
...
...
@@ -803,7 +926,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
if
torch
.
cuda
.
is_available
():
torch
.
cuda
.
manual_seed
(
seed
)
torch
.
set_default_device
(
device
)
punica_wrapper
=
PunicaWrapper
(
8192
,
256
,
device
)
max_loras
=
8
lora_config
=
LoRAConfig
(
max_loras
=
max_loras
,
max_lora_rank
=
8
,
...
...
@@ -825,6 +948,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
is_neox_style
,
)
lora_rope
=
LinearScalingRotaryEmbeddingWithLora
(
rope
)
lora_rope
.
set_mapping
(
punica_wrapper
)
lora_rope
.
create_lora_weights
(
max_loras
,
lora_config
)
linear_rope
=
get_rope
(
head_size
,
rotary_dim
,
max_position
,
base
,
is_neox_style
,
{
...
...
@@ -840,6 +964,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
input_range
=
(
0
,
lora_config
.
lora_extra_vocab_size
),
input_type
=
torch
.
float16
,
)
lora_mapping
=
LoRAMapping
(
index_mapping
,
prompt_mapping
)
long_lora_context
=
LongContextLoRAContext
(
list
(
scaling_factors
),
rotary_dim
)
...
...
@@ -854,7 +979,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
for
i
in
range
(
len
(
scaling_factors
)):
long_lora_context
.
offsets_by_lora_id
[
i
]
=
scaling_factor_to_offset
.
get
(
scaling_factors
[
i
],
0
)
mapping_info
=
convert_mapping
(
punica_wrapper
.
update_metadata
(
lora_mapping
,
id_to_index
,
max_loras
,
...
...
@@ -862,7 +987,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
lora_config
.
lora_extra_vocab_size
,
long_lora_context
=
long_lora_context
,
)
lora_rope
.
set_mapping
(
*
mapping_info
)
#
lora_rope.set_mapping(*mapping_info)
positions
=
torch
.
randint
(
0
,
max_position
,
(
batch_size
,
seq_len
))
query
=
torch
.
randn
(
batch_size
,
...
...
tests/lora/test_lora.py
deleted
100644 → 0
View file @
6b16ea2e
import
pytest
import
torch
from
vllm.lora.layers
import
_apply_lora
,
_apply_lora_packed_nslice
from
.utils
import
DummyLoRAManager
# Weight dimensions exercised by the tests; the last two entries are
# 11008 // 2 and 11008 // 4.
TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 5504, 2752]

# (q, k, v) output sizes; rows two and four are the first and third rows
# divided by 8 and 2 respectively.
QKV_TENSOR_SIZES = [
    (8192, 1024, 1024),
    (1024, 128, 128),
    (4096, 4096, 4096),
    (2048, 2048, 2048),
]

BATCH_SIZES = [8, 32, 256]
RANKS = [8]
DTYPES = [torch.float16]

# Per-dtype (rtol, atol) pairs passed to torch.allclose.
TOLERANCES = {
    torch.float16: (5e-3, 5e-3),
    torch.bfloat16: (3e-2, 2e-2),
}
@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora(m, n, k, rank, dtype) -> None:
    """Check ``_apply_lora`` against ``x @ lora_a @ lora_b * scaling``.

    All eight stack slots hold the same LoRA, so any random slot index must
    reproduce the reference. A second call with every index set to -1 must
    leave a zeroed output untouched.
    """
    manager = DummyLoRAManager()

    module_name = "module"
    weight = torch.rand([m, n], device="cuda", dtype=dtype)

    manager.init_random_lora(module_name, weight, rank=rank)
    lora = manager.get_module_lora(module_name)

    x = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = x @ lora.lora_a @ lora.lora_b * lora.scaling

    # Stacks are laid out (slot, 1, out_dim, in_dim), i.e. transposed
    # relative to lora_a / lora_b.
    lora_a_stack = torch.zeros(8,
                               1,
                               lora.lora_a.shape[1],
                               lora.lora_a.shape[0],
                               device="cuda",
                               dtype=dtype)
    lora_b_stack = torch.zeros(8,
                               1,
                               lora.lora_b.shape[1],
                               lora.lora_b.shape[0],
                               device="cuda",
                               dtype=dtype)
    # Broadcast the same (pre-scaled) weights into every slot.
    lora_a_stack[:, 0] = lora.lora_a.T
    lora_b_stack[:, 0] = (lora.lora_b * lora.scaling).T

    num_slots = lora_a_stack.shape[0]
    num_rows = len(x)
    output = torch.zeros(k, m, device="cuda", dtype=dtype)

    random_slots = torch.randint(0, num_slots, (num_rows, ), device="cuda")
    _apply_lora(x, lora_a_stack, lora_b_stack, random_slots, output)

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # Index -1 for every row: the output must stay all zeros.
    output[:] = 0
    no_lora = torch.full((num_rows, ), -1, device="cuda")
    _apply_lora(x, lora_a_stack, lora_b_stack, no_lora, output)
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
    """Check ``_apply_lora_packed_nslice`` with two equally sized slices.

    The reference is the column-wise concatenation of each slice's
    ``x @ lora_a @ lora_b * scaling``. Cases where ``m`` is odd or ``m // 2``
    is not in ``TENSOR_SIZES`` are skipped.
    """
    if m % 2 != 0:
        pytest.skip("m must be divisible by 2")
    half = m // 2
    if half not in TENSOR_SIZES:
        pytest.skip("m//2 must be in TENSOR_SIZES")

    manager = DummyLoRAManager()

    module_name = "module"
    weight = torch.rand([half, n], device="cuda", dtype=dtype)

    # One LoRA per slice, both built from the same weight.
    loras = []
    for suffix in ("1", "2"):
        manager.init_random_lora(module_name + suffix, weight, rank=rank)
        loras.append(manager.get_module_lora(module_name + suffix))
    lora_1 = loras[0]

    x = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = torch.cat(
        [x @ lora.lora_a @ lora.lora_b * lora.scaling for lora in loras],
        dim=1)

    # Both slices share lora_1's shapes, as their LoRAs come from one weight.
    lora_a_stacks = [
        torch.zeros(8,
                    1,
                    lora_1.lora_a.shape[1],
                    lora_1.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype) for _ in range(2)
    ]
    lora_b_stacks = [
        torch.zeros(8,
                    1,
                    lora_1.lora_b.shape[1],
                    lora_1.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype) for _ in range(2)
    ]
    # Fill every slot of each slice's stack with that slice's LoRA.
    for a_stack, b_stack, lora in zip(lora_a_stacks, lora_b_stacks, loras):
        a_stack[:, 0] = lora.lora_a.T
        b_stack[:, 0] = (lora.lora_b * lora.scaling).T

    num_rows = len(x)
    slice_sizes = (half, half)
    output = torch.zeros(k, m, device="cuda", dtype=dtype)

    random_slots = torch.randint(0,
                                 lora_a_stacks[0].shape[0], (num_rows, ),
                                 device="cuda")
    _apply_lora_packed_nslice(x, lora_a_stacks, lora_b_stacks, random_slots,
                              output, slice_sizes)

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # Index -1 for every row: the output must stay all zeros.
    output[:] = 0
    no_lora = torch.full((num_rows, ), -1, device="cuda")
    _apply_lora_packed_nslice(x, lora_a_stacks, lora_b_stacks, no_lora,
                              output, slice_sizes)
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
    """Check ``_apply_lora_packed_nslice`` with q/k/v slices.

    The q LoRA is built from a ``(qkv[0], n)`` weight; the k and v LoRAs are
    both built from one ``(qkv[1], n)`` weight. The reference is the
    column-wise concatenation of the three per-slice LoRA products.
    """
    manager = DummyLoRAManager()

    module_name = "module"
    weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype)
    weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype)

    loras = []
    for suffix, weight in (("q", weight_q), ("k", weight_kv),
                           ("v", weight_kv)):
        manager.init_random_lora(module_name + suffix, weight, rank=rank)
        loras.append(manager.get_module_lora(module_name + suffix))
    lora_q, lora_k, _ = loras

    x = torch.rand(k, n, device="cuda", dtype=dtype)
    expected = torch.cat(
        [x @ lora.lora_a @ lora.lora_b * lora.scaling for lora in loras],
        dim=1)

    # Stack 0 takes its shape from the q LoRA; stacks 1 and 2 (k and v) both
    # take theirs from the k LoRA.
    shape_sources = (lora_q, lora_k, lora_k)
    lora_a_stacks = [
        torch.zeros(8,
                    1,
                    src.lora_a.shape[1],
                    src.lora_a.shape[0],
                    device="cuda",
                    dtype=dtype) for src in shape_sources
    ]
    lora_b_stacks = [
        torch.zeros(8,
                    1,
                    src.lora_b.shape[1],
                    src.lora_b.shape[0],
                    device="cuda",
                    dtype=dtype) for src in shape_sources
    ]
    # Fill every slot of each slice's stack with that slice's LoRA.
    for a_stack, b_stack, lora in zip(lora_a_stacks, lora_b_stacks, loras):
        a_stack[:, 0] = lora.lora_a.T
        b_stack[:, 0] = (lora.lora_b * lora.scaling).T

    num_rows = len(x)
    slice_sizes = (qkv[0], qkv[1], qkv[2])
    output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype)

    random_slots = torch.randint(0,
                                 lora_a_stacks[0].shape[0], (num_rows, ),
                                 device="cuda")
    _apply_lora_packed_nslice(x, lora_a_stacks, lora_b_stacks, random_slots,
                              output, slice_sizes)

    rtol, atol = TOLERANCES[dtype]
    assert torch.allclose(expected, output, rtol=rtol, atol=atol)

    # Index -1 for every row: the output must stay all zeros.
    output[:] = 0
    no_lora = torch.full((num_rows, ), -1, device="cuda")
    _apply_lora_packed_nslice(x, lora_a_stacks, lora_b_stacks, no_lora,
                              output, slice_sizes)
    assert torch.allclose(torch.zeros_like(output), output)

    manager.reset_lora()
tests/lora/test_punica.py
deleted
100644 → 0
View file @
6b16ea2e
# Based on code from https://github.com/punica-ai/punica
import
pytest
import
torch
import
vllm.lora.punica
as
punica
def assert_close(a, b):
    """Assert ``a`` and ``b`` match within the tolerance for ``a.dtype``.

    float16 and bfloat16 use explicit ``(rtol, atol)`` pairs; float32 passes
    ``None`` for both so ``torch.testing.assert_close`` applies its defaults.
    Any other dtype raises ``KeyError``.
    """
    tolerance_by_dtype = {
        torch.float16: (5e-3, 5e-3),
        torch.bfloat16: (3e-2, 2e-2),
        torch.float32: (None, None),
    }
    rtol, atol = tolerance_by_dtype[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
def
_lora_ref_impl
(
y_final
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
wa_T_all
:
torch
.
Tensor
,
wb_T_all
:
torch
.
Tensor
,
indicies
:
torch
.
LongTensor
,
layer_idx
:
int
,
scale
:
float
,
):
y_stage_1
=
torch
.
empty
(
(
x
.
size
(
0
),
wa_T_all
.
size
(
-
2
)),
dtype
=
torch
.
float32
,
device
=
x
.
device
,
)
bs
=
x
.
shape
[
0
]
s
=
torch
.
tensor
(
scale
,
dtype
=
torch
.
float32
,
device
=
x
.
device
)
for
i
,
lora_idx
in
zip
(
range
(
bs
),
indicies
.
cpu
().
tolist
()):
xi
=
x
[
i
].
unsqueeze
(
0
).
to
(
torch
.
float32
)
wa
=
wa_T_all
[
lora_idx
,
layer_idx
].
transpose
(
-
1
,
-
2
).
to
(
torch
.
float32
)
if
wb_T_all
is
not
None
:
wb
=
wb_T_all
[
lora_idx
,
layer_idx
].
transpose
(
-
1
,
-
2
).
to
(
torch
.
float32
)
tmp
=
xi
@
wa
y_stage_1
[
i
]
=
tmp
.
squeeze
(
0
)
y_final
[
i
]
+=
((
tmp
@
wb
).
squeeze
(
0
)
*
s
if
wb_T_all
is
not
None
else
y_stage_1
[
i
])
return
y_final
,
y_stage_1
# Hidden sizes used for the input dimension of the tests.
H1 = [
    128, 256, 512, 896, 1024, 1152, 1216, 1280, 1536, 1664,
    2048, 2240, 2304, 2368, 2432, 2560, 2752, 3072, 3328, 3456,
    3584, 3712, 4096, 4480, 4608, 4736, 4864, 5120, 5504, 5632,
    5888, 6144, 6400, 6848, 6912, 7168, 7424, 8192, 8960, 9216,
    9472, 10240, 11008, 11264, 13824, 14336, 14784, 14848, 15360, 18944,
    22016, 22528, 24576, 27392, 27648, 29568, 29696, 32000, 32256, 32512,
    32768, 33024, 36864, 43264, 49152, 49408, 60544, 60672, 64000, 64256,
    102400, 102656, 128000, 128256,
]
# Output sizes: everything in H1 plus an extra leading 64.
H2 = [64] + H1
R = [1, 2, 4]
SEED = [0xabcdabcd987]
# "cuda:0" only when exactly one device is reported, otherwise
# "cuda:0" and "cuda:1".
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("r", R)
@pytest.mark.parametrize("seed", SEED)
@torch.inference_mode()
def test_lora_a_extra_shapes(dtype_str, h1, r, seed):
    """Compare ``punica.bgmv`` with the reference for the A-only projection.

    Runs on the default CUDA device with 4 LoRAs, 1 layer and a batch of 32,
    passing ``None`` as the B weights to the reference and a scale of 1.0 to
    both implementations.
    """
    torch.manual_seed(seed)
    num_loras, num_layers, bs = 4, 1, 32
    tensor_opts = {
        "dtype": getattr(torch, dtype_str),
        "device": torch.device("cuda"),
    }

    wa_T_all = torch.randn(num_loras, num_layers, r, h1, **tensor_opts)
    indices = torch.randint(num_loras, (bs, ),
                            dtype=torch.long,
                            device=tensor_opts["device"])

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, **tensor_opts)
        y = torch.randn(bs, r, **tensor_opts)

        # Both implementations accumulate into their own copy of y.
        y_ref = y.clone()
        _lora_ref_impl(y_ref, x, wa_T_all, None, indices, layer_idx, 1.0)

        y_our = y.clone()
        punica.bgmv(y_our, x, wa_T_all, indices, layer_idx, 1.0)

        assert_close(y_ref, y_our)
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness(dtype_str, h1, h2, seed, device):
    """Compare ``punica.add_lora`` with the reference for a full A/B LoRA.

    Uses 4 LoRAs of rank 8, 1 layer, a batch of 32 and scale 0.123, with all
    tensors created on ``device`` through the default-device setting.
    """
    torch.manual_seed(seed)
    num_loras, num_layers, r, bs = 4, 1, 8, 32
    scale = 0.123
    dtype = getattr(torch, dtype_str)
    torch.set_default_device(device)

    wa_T_all = torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
    wb_T_all = torch.randn(num_loras, num_layers, h2, r, dtype=dtype)
    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)

        # Both implementations accumulate into their own copy of y.
        y_ref = y.clone()
        _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx,
                       scale)

        y_our = y.clone()
        punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx,
                        scale)

        assert_close(y_ref, y_our)
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_lora_correctness_slice(dtype_str, h1, h2, seed, device):
    """Compare ``punica.add_lora_slice`` with the reference on three slices.

    The output is split into three equal column slices of width ``h2 // 3``,
    each with its own A/B weights. Cases where ``h2`` is not divisible by 3
    or ``h2 // 3`` is not in ``H1`` are skipped.
    """
    if h2 % 3 != 0 or h2 // 3 not in H1:
        pytest.skip("h2 must be divisible by 3 and in supported shapes")
    torch.manual_seed(seed)
    num_loras, num_layers, r, bs = 4, 1, 8, 32
    scale = 0.123
    dtype = getattr(torch, dtype_str)
    torch.set_default_device(device)

    # Three A weights are drawn first, then three B weights, so the random
    # stream is consumed in a fixed order.
    wa_T_slices = [
        torch.randn(num_loras, num_layers, r, h1, dtype=dtype)
        for _ in range(3)
    ]
    wb_T_slices = [
        torch.randn(num_loras, num_layers, h2 // 3, r, dtype=dtype)
        for _ in range(3)
    ]
    indices = torch.randint(num_loras, (bs, ), dtype=torch.long)

    for layer_idx in range(num_layers):
        x = torch.randn(bs, h1, dtype=dtype)
        y = torch.randn(bs, h2, dtype=dtype)
        s = h2 // 3
        # Column ranges of the three slices; the last one ends at h2 == 3 * s.
        spans = [slice(part * s, (part + 1) * s) for part in range(3)]

        y_ref = y.clone()
        for span, wa_T, wb_T in zip(spans, wa_T_slices, wb_T_slices):
            # The column view lets the reference accumulate in place.
            _lora_ref_impl(y_ref[:, span], x, wa_T, wb_T, indices, layer_idx,
                           scale)

        y_our = y.clone()
        for span, wa_T, wb_T in zip(spans, wa_T_slices, wb_T_slices):
            punica.add_lora_slice(y_our, x, wa_T, wb_T, indices, layer_idx,
                                  scale, span.start, s)

        for span in spans:
            assert_close(y_ref[:, span], y_our[:, span])
tests/lora/test_punica_sizes.py
0 → 100644
View file @
e661d594
"""
This script is mainly used to test various hidden_sizes. We have collected the
hidden_sizes included in the LoRA models currently supported by vLLM. It tests
whether the corresponding Triton kernel can run normally when tensor parallelism
is set to [1, 2, 4, 8, 16, 32, 64].
"""
import
random
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.lora.ops.bgmv_expand
import
bgmv_expand
from
vllm.lora.ops.bgmv_expand_slice
import
bgmv_expand_slice
from
vllm.lora.ops.bgmv_shrink
import
bgmv_shrink
from
vllm.lora.ops.sgmv_expand
import
sgmv_expand
from
vllm.lora.ops.sgmv_expand_slice
import
sgmv_expand_slice
from
vllm.lora.ops.sgmv_shrink
import
sgmv_shrink
from
vllm.triton_utils.libentry
import
LibEntry
from
.utils
import
(
generate_data
,
generate_data_for_expand_nslices
,
ref_torch_groupgemm
)
# Base hidden sizes; expanded below by the tensor-parallel divisors.
HIDDEN_SIZES = [
    128, 256, 512, 896, 1024, 1152, 1216, 1280, 1536, 1664,
    2048, 2240, 2304, 2368, 2432, 2560, 2752, 3072, 3328, 3456,
    3584, 3712, 4096, 4480, 4608, 4736, 4864, 5120, 5504, 5632,
    5888, 6144, 6400, 6848, 6912, 7168, 7424, 8192, 8960, 9216,
    9472, 10240, 11008, 11264, 13824, 14336, 14784, 14848, 15360, 18944,
    22016, 22528, 24576, 27392, 27648, 29568, 29696, 32000, 32256, 32512,
    32768, 33024, 36864, 43264, 49152, 49408, 60544, 60672, 64000, 64256,
    102400, 102656, 128000, 128256,
]
# The size of TP
divisibility = [1, 2, 4, 8, 16, 32, 64]

# Every base size floor-divided by every TP size, then de-duplicated.
all_hidden_size = [
    hidden_size // div for div in divisibility for hidden_size in HIDDEN_SIZES
]
HIDDEN_SIZES = list(set(all_hidden_size))

BATCHES = [4]
NUM_LORA = [4]
DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [32]
SCALES = [0.5]
SEED = [0]
CUDA_DEVICES = [f"cuda:{0}"]
def assert_close(a, b):
    """Assert ``a`` and ``b`` match within the tolerance for ``a.dtype``.

    The ``(rtol, atol)`` pair is looked up by ``a.dtype`` (float16, bfloat16
    or float32); any other dtype raises ``KeyError``.
    """
    tolerance_by_dtype = {
        torch.float16: (6e-2, 6e-2),
        torch.bfloat16: (6e-2, 6e-2),
        torch.float32: (1e-2, 1e-2),
    }
    rtol, atol = tolerance_by_dtype[a.dtype]
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_sgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Compare the SGMV shrink/expand kernels with ``ref_torch_groupgemm``.

    Data comes from ``generate_data`` with a sequence length of 128. The
    reference is given ``scaling`` for "shrink" and 1.0 for "expand"; for
    "shrink" the reference output is converted to float32 before comparison.
    """
    random.seed(seed)
    torch.set_default_device(device)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    is_shrink = op_type == "shrink"
    seq_length = 128
    (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        _,
    ) = generate_data(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        op_type,
        device,
    )

    # Reduce the per-sequence lengths to a plain Python number.
    longest = seq_len_tensor.max()
    if isinstance(longest, tuple):
        longest = longest[0]
    max_seq_length = longest.item()

    # Positional arguments shared by both kernels.
    kernel_args = (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        b_seq_start_loc,
        seq_len_tensor,
        lora_indices_tensor,
        batches,
        max_seq_length,
    )
    if is_shrink:
        sgmv_shrink(*kernel_args, scaling)
    else:
        sgmv_expand(*kernel_args, add_inputs=True)

    ref_torch_groupgemm(
        ref_out_tensor,
        inputs_tensor,
        lora_weights,
        lora_indices_tensor,
        seq_len_tensor,
        batches,
        scaling if is_shrink else 1.0,
        op_type,
    )
    if is_shrink:
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_bgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Compare the BGMV shrink/expand kernels with ``ref_torch_groupgemm``.

    Data comes from ``generate_data`` with a sequence length of 1. Each
    kernel is temporarily replaced by a ``LibEntry``-wrapped version so the
    run also exercises ``LibEntry``.
    """
    from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel
    from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel

    random.seed(seed)
    torch.set_default_device(device)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    is_shrink = op_type == "shrink"
    seq_length = 1
    (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        _,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        op_type,
        device,
    )

    # The current kernels do not require the libentry decoration. The
    # purpose of patching them is to test the correctness of libentry.
    if is_shrink:
        wrapped_kernel = LibEntry(_bgmv_shrink_kernel)
        with patch("vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel",
                   wrapped_kernel):
            bgmv_shrink(
                inputs_tensor,
                lora_weights,
                our_out_tensor,
                indices,
                scaling,
            )
    else:
        wrapped_kernel = LibEntry(_bgmv_expand_kernel)
        with patch("vllm.lora.ops.bgmv_expand._bgmv_expand_kernel",
                   wrapped_kernel):
            bgmv_expand(
                inputs_tensor,
                lora_weights,
                our_out_tensor,
                indices,
                add_inputs=True,
            )

    ref_torch_groupgemm(
        ref_out_tensor,
        inputs_tensor,
        lora_weights,
        lora_indices_tensor,
        seq_len_tensor,
        batches,
        scaling if is_shrink else 1.0,
        op_type,
    )
    if is_shrink:
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_expand_nslices(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Compare the sliced expand kernels with ``ref_torch_groupgemm``.

    Each of the ``nslices`` slices is ``hidden_size`` columns wide and is
    written at offset ``slice_index * hidden_size``. "sgmv" uses a sequence
    length of 128 and ``sgmv_expand_slice``; "bgmv" uses a sequence length of
    1 and ``bgmv_expand_slice`` with its kernel wrapped in ``LibEntry``.
    """
    from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel

    random.seed(seed)
    torch.set_default_device(device)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    use_sgmv = op_type == "sgmv"
    seq_length = 128 if use_sgmv else 1
    (
        inputs_tensor,
        lora_weights_lst,
        our_outputs,
        ref_outputs,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data_for_expand_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        nslices,
        device,
    )

    # Reduce the per-sequence lengths to a plain Python number.
    longest = seq_len_tensor.max()
    if isinstance(longest, tuple):
        longest = longest[0]
    max_seq_length = longest.item()

    for slice_index in range(nslices):
        slice_offset = slice_index * hidden_size
        lora_weights = lora_weights_lst[slice_index]

        if use_sgmv:
            sgmv_expand_slice(
                inputs_tensor,
                lora_weights,
                our_outputs,
                b_seq_start_loc,
                seq_len_tensor,
                lora_indices_tensor,
                batches,
                max_seq_length,
                slice_offset,
                hidden_size,
                add_inputs=True,
            )
        else:
            # The current _bgmv_expand_slice_kernel does not require the
            # libentry decoration. The purpose of adding this patch is to
            # test the correctness of libentry.
            wrapped_kernel = LibEntry(_bgmv_expand_slice_kernel)
            with patch(
                    "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel",
                    wrapped_kernel,
            ):
                bgmv_expand_slice(
                    inputs_tensor,
                    lora_weights,
                    our_outputs,
                    indices,
                    slice_offset,
                    slice_size=hidden_size,
                    add_inputs=True,
                )

        # The reference is handed the matching column window of ref_outputs.
        ref_torch_groupgemm(
            ref_outputs[:, slice_offset:slice_offset + hidden_size],
            inputs_tensor,
            lora_weights,
            lora_indices_tensor,
            seq_len_tensor,
            batches,
            1.0,
            op_type="expand",
        )

    assert_close(our_outputs, ref_outputs)
tests/lora/test_punica_variation.py
0 → 100644
View file @
e661d594
"""
This script is mainly used to test whether Triton kernels can run normally
under different conditions, including various batches, numbers of LoRA, and
maximum ranks.
"""
import
random
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.lora.ops.bgmv_expand
import
bgmv_expand
from
vllm.lora.ops.bgmv_expand_slice
import
bgmv_expand_slice
from
vllm.lora.ops.bgmv_shrink
import
bgmv_shrink
from
vllm.lora.ops.sgmv_expand
import
sgmv_expand
from
vllm.lora.ops.sgmv_expand_slice
import
sgmv_expand_slice
from
vllm.lora.ops.sgmv_shrink
import
sgmv_shrink
from
vllm.triton_utils.libentry
import
LibEntry
from
.utils
import
(
generate_data
,
generate_data_for_expand_nslices
,
ref_torch_groupgemm
)
# Parameter grids shared by the parametrized tests in this module.
HIDDEN_SIZES = [3424, 4096, 4097]

BATCHES = [1, 4, 16, 32]
NUM_LORA = [1, 4, 8, 16, 32, 64, 128]
DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [1, 4, 8, 16, 32, 64, 128]
SCALES = [0.5]
SEED = [0]
# Only the first CUDA device is used.
CUDA_DEVICES = ["cuda:0"]
def assert_close(a, b):
    """Assert that ``a`` and ``b`` are close, with tolerances chosen by dtype.

    The ``(rtol, atol)`` pair is looked up from ``a.dtype``; only float16,
    bfloat16 and float32 are listed, so any other dtype raises ``KeyError``.
    The comparison itself is delegated to ``torch.testing.assert_close``.
    """
    tolerance_by_dtype = {
        torch.float16: (6e-2, 6e-2),
        torch.bfloat16: (6e-2, 6e-2),
        torch.float32: (1e-2, 1e-2),
    }
    tolerances = tolerance_by_dtype[a.dtype]
    torch.testing.assert_close(a, b, rtol=tolerances[0], atol=tolerances[1])
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_sgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Check ``sgmv_shrink`` / ``sgmv_expand`` against ``ref_torch_groupgemm``.

    Data is produced by ``generate_data`` with a sequence length of 128.
    ``op_type`` selects which op runs; ``scaling`` is only applied on the
    shrink path (the reference uses 1.0 for expand).
    """
    # Seed every RNG in use so the generated data is reproducible.
    random.seed(seed)
    torch.set_default_device(device)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    shrinking = op_type == "shrink"
    seq_length = 128
    (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        op_type,
        device,
    )

    # Reduce the per-sequence lengths to a plain Python number.
    longest = seq_len_tensor.max()
    if isinstance(longest, tuple):
        longest = longest[0]
    max_seq_length = longest.item()

    # Both ops share the same leading positional arguments.
    shared_args = (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        b_seq_start_loc,
        seq_len_tensor,
        lora_indices_tensor,
        batches,
        max_seq_length,
    )
    if shrinking:
        sgmv_shrink(*shared_args, scaling)
    else:
        sgmv_expand(*shared_args, add_inputs=True)

    reference_scale = scaling if shrinking else 1.0
    ref_torch_groupgemm(
        ref_out_tensor,
        inputs_tensor,
        lora_weights,
        lora_indices_tensor,
        seq_len_tensor,
        batches,
        reference_scale,
        op_type,
    )
    if shrinking:
        # The shrink reference is compared in float32.
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_bgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Check ``bgmv_shrink`` / ``bgmv_expand`` against ``ref_torch_groupgemm``.

    Data is produced by ``generate_data`` with a sequence length of 1. The
    selected op runs with its kernel temporarily replaced by a
    ``LibEntry``-wrapped version of the same kernel.
    """
    from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel
    from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel

    # Seed every RNG in use so the generated data is reproducible.
    random.seed(seed)
    torch.set_default_device(device)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    shrinking = op_type == "shrink"
    seq_length = 1
    (
        inputs_tensor,
        lora_weights,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        op_type,
        device,
    )

    # Neither bgmv kernel needs the libentry wrapper today; it is patched in
    # only so that LibEntry itself is exercised by this test.
    if shrinking:
        patch_target = "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel"
        wrapped_kernel = LibEntry(_bgmv_shrink_kernel)
    else:
        patch_target = "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel"
        wrapped_kernel = LibEntry(_bgmv_expand_kernel)

    with patch(patch_target, wrapped_kernel):
        if shrinking:
            bgmv_shrink(
                inputs_tensor,
                lora_weights,
                our_out_tensor,
                indices,
                scaling,
            )
        else:
            bgmv_expand(
                inputs_tensor,
                lora_weights,
                our_out_tensor,
                indices,
                add_inputs=True,
            )

    reference_scale = scaling if shrinking else 1.0
    ref_torch_groupgemm(
        ref_out_tensor,
        inputs_tensor,
        lora_weights,
        lora_indices_tensor,
        seq_len_tensor,
        batches,
        reference_scale,
        op_type,
    )
    if shrinking:
        # The shrink reference is compared in float32.
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_punica_expand_nslices(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Check the sliced expand ops against ``ref_torch_groupgemm``.

    Each of the ``nslices`` LoRA weights is applied by ``sgmv_expand_slice``
    or ``bgmv_expand_slice`` (selected by ``op_type``) to its own
    ``hidden_size``-wide column window of ``our_outputs``. The reference
    helper writes the matching window of ``ref_outputs``, and both buffers
    are compared after the last slice.
    """
    from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel

    # Seed every RNG in use so the generated data is reproducible.
    random.seed(seed)
    torch.set_default_device(device)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    use_sgmv = op_type == "sgmv"
    seq_length = 128 if use_sgmv else 1
    (
        inputs_tensor,
        lora_weights_lst,
        our_outputs,
        ref_outputs,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
    ) = generate_data_for_expand_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        nslices,
        device,
    )

    # Reduce the per-sequence lengths to a plain Python number.
    longest = seq_len_tensor.max()
    if isinstance(longest, tuple):
        longest = longest[0]
    max_seq_length = longest.item()

    for index in range(nslices):
        # Slice ``index`` owns columns [slice_offset, slice_offset + hidden_size).
        slice_offset = index * hidden_size
        lora_weights = lora_weights_lst[index]
        if use_sgmv:
            sgmv_expand_slice(
                inputs_tensor,
                lora_weights,
                our_outputs,
                b_seq_start_loc,
                seq_len_tensor,
                lora_indices_tensor,
                batches,
                max_seq_length,
                slice_offset,
                hidden_size,
                add_inputs=True,
            )
        else:
            # _bgmv_expand_slice_kernel does not need the libentry wrapper
            # today; it is patched in only so that LibEntry itself is
            # exercised by this test.
            wrapped_kernel = LibEntry(_bgmv_expand_slice_kernel)
            with patch(
                    "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel",
                    wrapped_kernel,
            ):
                bgmv_expand_slice(
                    inputs_tensor,
                    lora_weights,
                    our_outputs,
                    indices,
                    slice_offset,
                    slice_size=hidden_size,
                    add_inputs=True,
                )
        ref_torch_groupgemm(
            ref_outputs[:, slice_offset:slice_offset + hidden_size],
            inputs_tensor,
            lora_weights,
            lora_indices_tensor,
            seq_len_tensor,
            batches,
            1.0,
            op_type="expand",
        )

    assert_close(our_outputs, ref_outputs)
if __name__ == "__main__":
    # Manual smoke run: drive test_punica_bgmv directly over a reduced grid
    # (float16, "expand", scaling 1.0) without going through pytest.
    from itertools import product

    # The iterables must follow test_punica_bgmv's positional order:
    # batches, num_loras, rank, hidden_size, scaling, dtype, op_type, seed,
    # device. HIDDEN_SIZES was previously missing, which shifted every later
    # argument by one and left ``device`` unfilled (TypeError on first call).
    for ele in product(
            BATCHES,
            NUM_LORA,
            MAX_RANKS,
            HIDDEN_SIZES,
            [1.0],
            [torch.float16],
            ["expand"],
            SEED,
            CUDA_DEVICES,
    ):
        test_punica_bgmv(*ele)
        print(f"{ele},pass")
tests/lora/test_quant_model.py
View file @
e661d594
...
...
@@ -64,14 +64,16 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
# if torch.cuda.device_count() < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm
=
vllm
.
LLM
(
model
=
model
.
model_path
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_model_len
=
400
,
tensor_parallel_size
=
tp_size
,
quantization
=
model
.
quantization
,
trust_remote_code
=
True
)
llm
=
vllm
.
LLM
(
model
=
model
.
model_path
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
max_model_len
=
400
,
tensor_parallel_size
=
tp_size
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
trust_remote_code
=
True
)
if
model
.
quantization
is
None
:
expected_no_lora_output
=
[
...
...
@@ -156,24 +158,28 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model):
# if torch.cuda.device_count() < 2:
# pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
llm_tp1
=
vllm
.
LLM
(
model
=
model
.
model_path
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
quantization
=
model
.
quantization
,
trust_remote_code
=
True
)
llm_tp1
=
vllm
.
LLM
(
model
=
model
.
model_path
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
,
trust_remote_code
=
True
)
output_tp1
=
do_sample
(
llm_tp1
,
tinyllama_lora_files
,
lora_id
=
1
)
del
llm_tp1
cleanup
()
llm_tp2
=
vllm
.
LLM
(
model
=
model
.
model_path
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
tensor_parallel_size
=
2
,
quantization
=
model
.
quantization
)
llm_tp2
=
vllm
.
LLM
(
model
=
model
.
model_path
,
enable_lora
=
True
,
max_num_seqs
=
16
,
max_loras
=
4
,
tensor_parallel_size
=
2
,
gpu_memory_utilization
=
0.2
,
#avoid OOM
quantization
=
model
.
quantization
)
output_tp2
=
do_sample
(
llm_tp2
,
tinyllama_lora_files
,
lora_id
=
1
)
del
llm_tp2
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment