Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
500b93c8
Commit
500b93c8
authored
Jul 25, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1
parents
99426767
38c4b7e8
Changes
282
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
861 additions
and
340 deletions
+861
-340
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+45
-122
tests/engine/output_processor/test_stop_checker.py
tests/engine/output_processor/test_stop_checker.py
+2
-2
tests/engine/test_custom_executor.py
tests/engine/test_custom_executor.py
+91
-0
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+61
-0
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+24
-29
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+115
-95
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+11
-11
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+21
-21
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+5
-3
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_tokenization.py
+152
-0
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+15
-17
tests/kernels/quant_utils.py
tests/kernels/quant_utils.py
+72
-0
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+5
-3
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_blocksparse_attention.py
+5
-3
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+2
-2
tests/kernels/test_fp8_quant.py
tests/kernels/test_fp8_quant.py
+87
-0
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+10
-16
tests/kernels/test_marlin_gemm.py
tests/kernels/test_marlin_gemm.py
+129
-13
tests/lora/conftest.py
tests/lora/conftest.py
+8
-2
tests/lora/test_long_context.py
tests/lora/test_long_context.py
+1
-1
No files found.
tests/distributed/test_pipeline_parallel.py
View file @
500b93c8
import
os
import
openai
# use the official client for correctness check
import
pytest
from
..utils
import
RemoteOpenAIServer
from
..utils
import
compare_two_settings
# downloading lora to test lora requests
VLLM_MULTI_NODE
=
os
.
getenv
(
"VLLM_MULTI_NODE"
,
"0"
)
==
"1"
# any model with a chat template should work here
MODEL_NAME
=
"meta-llama/Meta-Llama-3-8B"
EAGER_MODE
=
bool
(
int
(
os
.
getenv
(
"EAGER_MODE"
,
0
)))
CHUNKED_PREFILL
=
bool
(
int
(
os
.
getenv
(
"CHUNKED_PREFILL"
,
0
)))
TP_SIZE
=
int
(
os
.
getenv
(
"TP_SIZE"
,
1
))
PP_SIZE
=
int
(
os
.
getenv
(
"PP_SIZE"
,
1
))
pytestmark
=
pytest
.
mark
.
asyncio
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
"--model"
,
MODEL_NAME
,
@
pytest
.
mark
.
parametrize
(
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND"
,
[
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"ray"
),
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
,
"mp"
),
])
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
MODEL_NAME
,
DIST_BACKEND
):
if
VLLM_MULTI_NODE
and
DIST_BACKEND
==
"mp"
:
pytest
.
skip
(
"Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend"
)
pp_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"
b
float16"
,
"float16"
,
"--pipeline-parallel-size"
,
str
(
PP_SIZE
),
"--tensor-parallel-size"
,
str
(
TP_SIZE
),
"--distributed-executor-backend"
,
"ray"
,
DIST_BACKEND
,
]
# compare without pipeline parallelism
# NOTE: use mp backend for TP
# PP tests might involve multiple nodes, and ray might
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--tensor-parallel-size"
,
str
(
max
(
TP_SIZE
,
2
)),
# We only use 2 GPUs in the CI.
"--distributed-executor-backend"
,
"mp"
,
]
if
CHUNKED_PREFILL
:
args
+=
[
"--enable-chunked-prefill"
,
]
pp_args
.
append
(
"--enable-chunked-prefill"
)
tp_args
.
append
(
"--enable-chunked-prefill"
)
if
EAGER_MODE
:
args
+=
[
"--enforce-eager"
,
]
with
RemoteOpenAIServer
(
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
async
def
test_check_models
(
server
,
client
:
openai
.
AsyncOpenAI
):
models
=
await
client
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
assert
served_model
.
id
==
MODEL_NAME
assert
all
(
model
.
root
==
MODEL_NAME
for
model
in
models
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_single_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
completion
.
choices
[
0
].
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_batch_completions
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test simple list
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
batch
.
choices
)
==
2
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
1
].
text
# test n = 2
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
n
=
2
,
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary
# for official client.
use_beam_search
=
True
),
)
assert
len
(
batch
.
choices
)
==
4
assert
batch
.
choices
[
0
].
text
!=
batch
.
choices
[
1
].
text
,
"beam search should be different"
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
2
].
text
,
"two copies of the same prompt should be the same"
assert
batch
.
choices
[
1
].
text
==
batch
.
choices
[
3
].
text
,
"two copies of the same prompt should be the same"
pp_args
.
append
(
"--enforce-eager"
)
tp_args
.
append
(
"--enforce-eager"
)
# test streaming
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
)
texts
=
[
""
]
*
2
async
for
chunk
in
batch
:
assert
len
(
chunk
.
choices
)
==
1
choice
=
chunk
.
choices
[
0
]
texts
[
choice
.
index
]
+=
choice
.
text
assert
texts
[
0
]
==
texts
[
1
]
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
)
tests/engine/output_processor/test_stop_checker.py
View file @
500b93c8
...
...
@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str,
@
pytest
.
mark
.
parametrize
([
"text_wo_eos"
,
"eos_token"
,
"eos_token_id"
],
[
(
"This text ends with EOS token"
,
"</s>"
,
2
),
])
@
pytest
.
mark
.
parametrize
(
"ignore_eos"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"include_stop_str_in_output"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"ignore_eos"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"include_stop_str_in_output"
,
[
True
,
False
])
@
pytest
.
mark
.
skip_global_cleanup
def
test_stop_on_eos_token
(
text_wo_eos
:
str
,
eos_token
:
str
,
eos_token_id
:
int
,
ignore_eos
:
bool
,
include_stop_str_in_output
:
bool
):
...
...
tests/engine/test_custom_executor.py
0 → 100644
View file @
500b93c8
import
asyncio
import
os
import
pytest
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.executor.gpu_executor
import
GPUExecutor
,
GPUExecutorAsync
from
vllm.sampling_params
import
SamplingParams
class
Mock
:
...
class
CustomGPUExecutor
(
GPUExecutor
):
def
execute_model
(
self
,
*
args
,
**
kwargs
):
# Drop marker to show that this was ran
with
open
(
".marker"
,
"w"
):
...
return
super
().
execute_model
(
*
args
,
**
kwargs
)
class
CustomGPUExecutorAsync
(
GPUExecutorAsync
):
async
def
execute_model_async
(
self
,
*
args
,
**
kwargs
):
with
open
(
".marker"
,
"w"
):
...
return
await
super
().
execute_model_async
(
*
args
,
**
kwargs
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor_type_checking
(
model
):
with
pytest
.
raises
(
ValueError
):
engine_args
=
EngineArgs
(
model
=
model
,
distributed_executor_backend
=
Mock
)
LLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
ValueError
):
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
Mock
)
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
with
pytest
.
raises
(
TypeError
):
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomGPUExecutor
)
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor
(
model
,
tmpdir
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmpdir
)
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
engine_args
=
EngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomGPUExecutor
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
engine
.
add_request
(
"0"
,
"foo"
,
sampling_params
)
engine
.
step
()
assert
os
.
path
.
exists
(
".marker"
)
finally
:
os
.
chdir
(
cwd
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
def
test_custom_executor_async
(
model
,
tmpdir
):
cwd
=
os
.
path
.
abspath
(
"."
)
os
.
chdir
(
tmpdir
)
try
:
assert
not
os
.
path
.
exists
(
".marker"
)
engine_args
=
AsyncEngineArgs
(
model
=
model
,
distributed_executor_backend
=
CustomGPUExecutorAsync
)
engine
=
AsyncLLMEngine
.
from_engine_args
(
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
async
def
t
():
stream
=
await
engine
.
add_request
(
"0"
,
"foo"
,
sampling_params
)
async
for
x
in
stream
:
...
asyncio
.
run
(
t
())
assert
os
.
path
.
exists
(
".marker"
)
finally
:
os
.
chdir
(
cwd
)
tests/entrypoints/openai/test_basic.py
0 → 100644
View file @
500b93c8
from
http
import
HTTPStatus
import
openai
import
pytest
import
requests
from
vllm.version
import
__version__
as
VLLM_VERSION
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
"--max-num-seqs"
,
"128"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest
.
mark
.
asyncio
async
def
test_show_version
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/version"
)
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/health"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
@
pytest
.
mark
.
asyncio
async
def
test_log_metrics
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/metrics"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
tests/entrypoints/openai/test_chat.py
View file @
500b93c8
...
...
@@ -7,11 +7,11 @@ import jsonschema
import
openai
# use the official client for correctness check
import
pytest
import
torch
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
...
@@ -21,33 +21,28 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora"
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
with
RemoteOpenAIServer
([
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
# lora config below
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
"--max-num-seqs"
,
"128"
,
])
as
remote_server
:
def
server
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
):
# noqa: F811
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
# lora config below
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_added_tokens_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
"--max-num-seqs"
,
"128"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/openai/test_completion.py
View file @
500b93c8
# imports for guided decoding tests
import
json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
import
requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
...
@@ -17,9 +19,13 @@ from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically th
is needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
# technically th
ese adapters use a different base model,
#
but we're not testing
generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
PA_NAME
=
"swapnilbp/llama_tweet_ptune"
# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS
=
8
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
@@ -28,28 +34,58 @@ def zephyr_lora_files():
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
with
RemoteOpenAIServer
([
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
# lora config below
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
"--max-num-seqs"
,
"128"
,
])
as
remote_server
:
def
zephyr_lora_added_tokens_files
(
zephyr_lora_files
):
tmp_dir
=
TemporaryDirectory
()
tmp_model_dir
=
f
"
{
tmp_dir
.
name
}
/zephyr"
shutil
.
copytree
(
zephyr_lora_files
,
tmp_model_dir
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added
=
tokenizer
.
add_tokens
([
"vllm1"
,
"vllm2"
,
"vllm3"
],
special_tokens
=
True
)
assert
added
==
3
tokenizer
.
save_pretrained
(
tmp_model_dir
)
yield
tmp_model_dir
tmp_dir
.
cleanup
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_pa_files
():
return
snapshot_download
(
repo_id
=
PA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
,
zephyr_pa_files
):
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--max-num-seqs"
,
"128"
,
"--enforce-eager"
,
# lora config
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_added_tokens_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
# pa config
"--enable-prompt-adapter"
,
"--prompt-adapters"
,
f
"zephyr-pa=
{
zephyr_pa_files
}
"
,
f
"zephyr-pa2=
{
zephyr_pa_files
}
"
,
"--max-prompt-adapters"
,
"2"
,
"--max-prompt-adapter-token"
,
"128"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
@@ -60,11 +96,14 @@ def client(server):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
# first test base model, then test loras, then test prompt adapters
"model_name,num_virtual_tokens"
,
[(
MODEL_NAME
,
0
),
(
"zephyr-lora"
,
0
),
(
"zephyr-lora2"
,
0
),
(
"zephyr-pa"
,
PA_NUM_VIRTUAL_TOKENS
),
(
"zephyr-pa2"
,
PA_NUM_VIRTUAL_TOKENS
)],
)
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
num_virtual_tokens
:
int
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
...
...
@@ -77,28 +116,58 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
completion_tokens
=
5
,
prompt_tokens
=
6
+
num_virtual_tokens
,
total_tokens
=
11
+
num_virtual_tokens
)
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
len
(
completion
.
choices
[
0
].
text
)
>=
1
@
pytest
.
mark
.
asyncio
async
def
test_added_lora_tokens
(
client
:
openai
.
AsyncOpenAI
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
"zephyr-lora2"
,
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
],
echo
=
True
,
max_tokens
=
5
,
temperature
=
0.0
,
)
# Added tokens should appear in tokenized prompt
assert
completion
.
choices
[
0
].
text
.
startswith
(
"<unk><unk>vllm1vllm2vllm3"
)
@
pytest
.
mark
.
asyncio
async
def
test_added_lora_tokens_base_model
(
client
:
openai
.
AsyncOpenAI
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
],
echo
=
True
,
max_tokens
=
5
,
temperature
=
0.0
,
)
# Added tokens should not appear in tokenized prompt
assert
"vllm"
not
in
completion
.
choices
[
0
].
text
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
# first test base model, then test loras
, then test prompt adapters
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
,
"zephyr-pa"
,
"zephyr-pa2"
],
)
async
def
test_no_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
...
...
@@ -110,14 +179,14 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
# just test 1 lora
and 1 pa
hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_zero_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
...
...
@@ -133,12 +202,12 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_some_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
...
...
@@ -154,7 +223,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_too_many_completion_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
...
...
@@ -162,7 +231,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
...
...
@@ -174,7 +243,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
stream
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
...
...
@@ -199,7 +268,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_completion_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
...
...
@@ -233,7 +302,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
"HuggingFaceH4/
zephyr-
7b-bet
a"
,
"zephyr-
lor
a"
],
[
MODEL_NAME
,
"
zephyr-
lor
a"
,
"zephyr-
p
a"
],
)
async
def
test_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
...
...
@@ -369,9 +438,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-pa"
],
)
async
def
test_batch_completions
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test both text and token IDs
...
...
@@ -614,51 +682,3 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
prompt
=
"Give an example string that fits this regex"
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_json
=
sample_json_schema
))
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_tokenize
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
,
tokenizer_mode
=
"fast"
)
for
add_special
in
[
False
,
True
]:
prompt
=
"This is a test prompt."
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
add_special
)
response
=
requests
.
post
(
base_url
+
"/tokenize"
,
json
=
{
"add_special_tokens"
:
add_special
,
"model"
:
model_name
,
"prompt"
:
prompt
})
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"tokens"
:
tokens
,
"count"
:
len
(
tokens
),
"max_model_len"
:
8192
}
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_detokenize
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
base_url
=
str
(
client
.
base_url
)[:
-
3
]
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
,
tokenizer_mode
=
"fast"
)
prompt
=
"This is a test prompt."
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)
response
=
requests
.
post
(
base_url
+
"detokenize"
,
json
=
{
"model"
:
model_name
,
"tokens"
:
tokens
})
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"prompt"
:
prompt
}
tests/entrypoints/openai/test_embedding.py
View file @
500b93c8
...
...
@@ -11,17 +11,17 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_server
():
with
RemoteOpenAIServer
(
[
"--model"
,
EMBEDDING_MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype
"
,
"bfloat16
"
,
"--enforce-eager
"
,
"--max-model-len
"
,
"8192"
,
"--enforce-eager"
,
]
)
as
remote_server
:
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--enforce-eager
"
,
"--max-model-len
"
,
"8192
"
,
"--enforce-eager
"
,
]
with
RemoteOpenAIServer
(
EMBEDDING_MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/openai/test_models.py
View file @
500b93c8
...
...
@@ -19,27 +19,27 @@ def zephyr_lora_files():
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
with
RemoteOpenAIServer
(
[
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype
"
,
"bfloat16
"
,
"--max-model-len
"
,
"8192"
,
"--en
force-eager
"
,
# lora config below
"--enable-lora
"
,
"--lora-modu
les"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
"--max-
lora-rank
"
,
"64
"
,
"--max-
cpu-lora
s"
,
"2
"
,
"--max-num-seqs"
,
"128"
,
]
)
as
remote_server
:
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len
"
,
"8192
"
,
"--enforce-eager
"
,
# lora config below
"--en
able-lora
"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_fi
les
}
"
,
"--max-lora-rank
"
,
"64
"
,
"--max-
cpu-loras
"
,
"2
"
,
"--max-
num-seq
s"
,
"128
"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
500b93c8
...
...
@@ -32,11 +32,13 @@ async def _async_serving_chat_init():
model_config
,
served_model_names
=
[
MODEL_NAME
],
response_role
=
"assistant"
,
chat_template
=
CHAT_TEMPLATE
)
chat_template
=
CHAT_TEMPLATE
,
lora_modules
=
None
,
prompt_adapters
=
None
,
request_logger
=
None
)
return
serving_completion
def
test_async_serving_chat_init
():
serving_completion
=
asyncio
.
run
(
_async_serving_chat_init
())
assert
serving_completion
.
tokenizer
is
not
None
assert
serving_completion
.
tokenizer
.
chat_template
==
CHAT_TEMPLATE
assert
serving_completion
.
chat_template
==
CHAT_TEMPLATE
tests/entrypoints/openai/test_tokenization.py
0 → 100644
View file @
500b93c8
import
openai
# use the official client for correctness check
import
pytest
import
requests
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_added_tokens_files
:
str
):
# noqa: F811
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
"--max-num-seqs"
,
"128"
,
# lora config
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora2=
{
zephyr_lora_added_tokens_files
}
"
,
"--max-lora-rank"
,
"64"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
tokenizer_name
(
model_name
:
str
,
zephyr_lora_added_tokens_files
:
str
):
# noqa: F811
return
zephyr_lora_added_tokens_files
if
(
model_name
==
"zephyr-lora2"
)
else
model_name
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name,tokenizer_name"
,
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
indirect
=
[
"tokenizer_name"
],
)
async
def
test_tokenize_completions
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
tokenizer_name
:
str
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
for
add_special
in
[
False
,
True
]:
prompt
=
"vllm1 This is a test prompt."
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
add_special
)
response
=
requests
.
post
(
base_url
+
"/tokenize"
,
json
=
{
"add_special_tokens"
:
add_special
,
"model"
:
model_name
,
"prompt"
:
prompt
})
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"tokens"
:
tokens
,
"count"
:
len
(
tokens
),
"max_model_len"
:
8192
}
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name,tokenizer_name"
,
[(
MODEL_NAME
,
MODEL_NAME
),
(
"zephyr-lora2"
,
"zephyr-lora2"
)],
indirect
=
[
"tokenizer_name"
],
)
async
def
test_tokenize_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
tokenizer_name
:
str
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
for
add_generation
in
[
False
,
True
]:
for
add_special
in
[
False
,
True
]:
conversation
=
[{
"role"
:
"user"
,
"content"
:
"Hi there!"
},
{
"role"
:
"assistant"
,
"content"
:
"Nice to meet you!"
},
{
"role"
:
"user"
,
"content"
:
"Can I ask a question? vllm1"
}]
prompt
=
tokenizer
.
apply_chat_template
(
add_generation_prompt
=
add_generation
,
conversation
=
conversation
,
tokenize
=
False
)
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
add_special
)
response
=
requests
.
post
(
base_url
+
"/tokenize"
,
json
=
{
"add_generation_prompt"
:
add_generation
,
"add_special_tokens"
:
add_special
,
"messages"
:
conversation
,
"model"
:
model_name
})
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"tokens"
:
tokens
,
"count"
:
len
(
tokens
),
"max_model_len"
:
8192
}
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
                          tokenizer_name: str):
    """Round-trip a prompt: encode it locally, send the token ids to the
    server's /detokenize endpoint, and expect the original text back.
    """
    # Cut the last three characters off the client's base URL and drop any
    # surrounding slashes (presumably this removes the "v1/" API suffix --
    # confirm against the fixture that builds the client).
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    prompt = "This is a test prompt. vllm1"
    # Special tokens are left out so that decoding can reproduce the prompt.
    token_ids = tokenizer.encode(prompt, add_special_tokens=False)

    print(f"CALLING {base_url} FOR {model_name}")
    payload = {"model": model_name, "tokens": token_ids}
    response = requests.post(base_url + "/detokenize", json=payload)
    response.raise_for_status()

    expected_body = {"prompt": prompt}
    assert response.json() == expected_body
tests/entrypoints/openai/test_vision.py
View file @
500b93c8
...
...
@@ -2,9 +2,8 @@ from typing import Dict, List
import
openai
import
pytest
import
pytest_asyncio
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
...utils
import
VLLM_PATH
,
RemoteOpenAIServer
...
...
@@ -23,17 +22,17 @@ TEST_IMAGE_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
with
RemoteOpenAIServer
(
[
"--model
"
,
MODEL_NAME
,
"--dtype
"
,
"bfloat1
6"
,
"--max-model-len
"
,
"4096
"
,
"--enforce-eager"
,
"--chat-template"
,
str
(
LLAVA_CHAT_TEMPLATE
),
]
)
as
remote_server
:
args
=
[
"--dtype
"
,
"bfloat16"
,
"--max-model-len
"
,
"409
6"
,
"--enforce-eager
"
,
"--chat-template
"
,
str
(
LLAVA_CHAT_TEMPLATE
)
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
...
...
@@ -42,11 +41,10 @@ def client(server):
return
server
.
get_async_client
()
@
pytest
_asyncio
.
fixture
(
scope
=
"session"
)
async
def
base64_encoded_image
()
->
Dict
[
str
,
str
]:
@
pytest
.
fixture
(
scope
=
"session"
)
def
base64_encoded_image
()
->
Dict
[
str
,
str
]:
return
{
image_url
:
encode_image_base64
(
await
ImageFetchAiohttp
.
fetch_image
(
image_url
))
image_url
:
encode_image_base64
(
fetch_image
(
image_url
))
for
image_url
in
TEST_IMAGE_URLS
}
...
...
tests/kernels/quant_utils.py
0 → 100644
View file @
500b93c8
from
typing
import
Optional
,
Tuple
,
Union
import
torch
def as_float32_tensor(
        x: Union[float, torch.Tensor],
        device: Union[str, torch.device] = 'cuda') -> torch.Tensor:
    """Return ``x`` as a float32 tensor on ``device``.

    Args:
        x: A Python scalar or an existing tensor.
        device: Target device. Defaults to ``'cuda'``, which is what every
            caller relied on before this parameter existed; pass ``'cpu'``
            to run the reference computations without a GPU.

    Returns:
        A ``torch.float32`` tensor holding the values of ``x`` on ``device``.
    """
    return torch.as_tensor(x, dtype=torch.float32, device=device)
def ref_dynamic_per_token_quant(x: torch.Tensor,
                                quant_dtype: torch.dtype,
                                scale_ub: Optional[torch.Tensor] = None) \
        -> Tuple[torch.Tensor, torch.Tensor]:
    """Reference PyTorch implementation of dynamic per-token quantization.

    One scale is computed per index of the leading dimension(s): the maximum
    absolute value along the last dimension divided by the largest value
    representable in ``quant_dtype``. ``x`` is then quantized with those
    scales.

    Args:
        x: Input activations. The ``[:, None]`` indexing of the per-token
            maxima assumes ``x`` is 2-D (presumably
            ``(num_tokens, hidden_size)`` -- confirm against callers).
        quant_dtype: ``torch.int8`` or ``torch.float8_e4m3fn``.
        scale_ub: Optional upper bound applied to the per-token maxima
            before the scales are derived. Only accepted for fp8.

    Returns:
        ``(quantized, scales)``: the quantized tensor in ``quant_dtype`` and
        the float32 scales with a trailing singleton dimension.

    Raises:
        AssertionError: If ``quant_dtype`` is unsupported, or ``scale_ub`` is
            given together with ``torch.int8``.
    """

    assert quant_dtype in [torch.int8, torch.float8_e4m3fn]
    if scale_ub is not None:
        assert quant_dtype == torch.float8_e4m3fn

    # Integer and floating-point dtypes expose their limits through
    # different torch helpers.
    qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
            else torch.finfo(quant_dtype)
    qtype_max = as_float32_tensor(qtype_traits.max)
    # Constants are materialized as float32 tensors so that the arithmetic
    # below is carried out on float32 tensors.
    s_1 = as_float32_tensor(1.0)
    s_512 = as_float32_tensor(512.0)

    # For fp8, in order to match the cuda kernel output, we have to do exactly
    # the same operations as in the corresponding fp8 kernel to prevent
    # rounding errors.

    # Compute scales
    x_token_max, _ = x.abs().max(dim=-1)
    x_token_max = as_float32_tensor(x_token_max)
    if scale_ub is not None:
        x_token_max = x_token_max.clamp(max=scale_ub)
    scales = (x_token_max / qtype_max)[:, None]

    # Quant
    if quant_dtype == torch.int8:
        # int8 path: multiply by the reciprocal scale, round, then saturate
        # to the int8 range.
        iscales = as_float32_tensor(s_1 / scales)
        torch_out = as_float32_tensor(x) * iscales
        torch_out = torch_out.round()
        torch_out = torch_out.clamp(qtype_traits.min,
                                    qtype_traits.max).to(quant_dtype)
    else:
        assert quant_dtype == torch.float8_e4m3fn
        # fp8 path: scales are bounded below by 1 / (fp8_max * 512) before
        # the division; the returned scales are the clamped ones.
        min_scaling_factor = s_1 / (qtype_max * s_512)
        scales = scales.clamp(min=min_scaling_factor)
        torch_out = as_float32_tensor(x) / scales
        torch_out = torch_out.clamp(qtype_traits.min,
                                    qtype_traits.max).to(quant_dtype)

    return torch_out, scales
# The int8 version is very similar. Incorporate the int8 version, like in
# ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
# kernel
def ref_dynamic_per_tensor_fp8_quant(x: torch.Tensor) \
                    -> Tuple[torch.Tensor, torch.Tensor]:
    """Reference PyTorch implementation of dynamic per-tensor fp8 quantization.

    A single scale is derived from the maximum absolute value of ``x``
    divided by the largest ``torch.float8_e4m3fn`` value. ``x`` is multiplied
    by the reciprocal of that scale, saturated to the fp8 range and cast to
    ``torch.float8_e4m3fn``.

    Args:
        x: Input activations of any shape.

    Returns:
        ``(quantized, scale)``: the fp8 tensor and the float32 scalar scale.
        No lower bound is applied to the scale, so an all-zero ``x`` yields a
        zero scale.
    """
    fp8_traits = torch.finfo(torch.float8_e4m3fn)
    fp8_max = as_float32_tensor(fp8_traits.max)
    one = as_float32_tensor(1.0)

    # For fp8, in order to match the cuda kernel output, we have to do exactly
    # the same operations as in the corresponding fp8 kernel to prevent
    # rounding errors. Keep the order: scale, reciprocal, multiply, clamp.
    x_max = as_float32_tensor(x.abs().max())
    ref_scale = x_max / fp8_max
    ref_iscale = one / ref_scale

    scaled = as_float32_tensor(x) * ref_iscale
    saturated = scaled.clamp(fp8_traits.min, fp8_traits.max)
    ref_out = saturated.to(dtype=torch.float8_e4m3fn)
    return ref_out, ref_scale
tests/kernels/test_attention.py
View file @
500b93c8
...
...
@@ -176,7 +176,7 @@ def test_paged_attention(
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
# Using default kv_scale
kv_scale
=
1.0
k
_scale
=
v_scale
=
1.0
# Call the paged attention kernel.
output
=
torch
.
empty_like
(
query
)
...
...
@@ -194,7 +194,8 @@ def test_paged_attention(
max_seq_len
,
alibi_slopes
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
)
elif
version
==
"v2"
:
num_partitions
=
((
max_seq_len
+
PARTITION_SIZE
-
1
)
//
PARTITION_SIZE
)
...
...
@@ -225,7 +226,8 @@ def test_paged_attention(
max_seq_len
,
alibi_slopes
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
)
else
:
raise
AssertionError
(
f
"Unknown version:
{
version
}
"
)
...
...
tests/kernels/test_blocksparse_attention.py
View file @
500b93c8
...
...
@@ -212,7 +212,7 @@ def test_paged_attention(
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
# Using default kv_scale
kv_scale
=
1.0
k
_scale
=
v_scale
=
1.0
tp_rank
=
0
# Call the paged attention kernel.
...
...
@@ -231,7 +231,8 @@ def test_paged_attention(
max_seq_len
,
alibi_slopes
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
tp_rank
=
tp_rank
,
blocksparse_local_blocks
=
blocksparse_local_blocks
,
blocksparse_vert_stride
=
blocksparse_vert_stride
,
...
...
@@ -267,7 +268,8 @@ def test_paged_attention(
max_seq_len
,
alibi_slopes
,
kv_cache_dtype
,
kv_scale
,
k_scale
,
v_scale
,
tp_rank
=
tp_rank
,
blocksparse_local_blocks
=
blocksparse_local_blocks
,
blocksparse_vert_stride
=
blocksparse_vert_stride
,
...
...
tests/kernels/test_cache.py
View file @
500b93c8
...
...
@@ -156,11 +156,11 @@ def test_reshape_and_cache(
cloned_value_cache
=
value_cache
.
clone
()
# Using default kv_scale
kv_scale
=
1.0
k
_scale
=
v_scale
=
1.0
# Call the reshape_and_cache kernel.
ops
.
reshape_and_cache
(
key
,
value
,
key_cache
,
value_cache
,
slot_mapping
,
kv_cache_dtype
,
kv_scale
)
kv_cache_dtype
,
k
_scale
,
v_scale
)
if
kv_cache_dtype
==
"fp8"
:
result_key_cache
=
torch
.
empty_like
(
key_cache
,
dtype
=
torch
.
float16
)
...
...
tests/kernels/test_fp8_quant.py
0 → 100644
View file @
500b93c8
import
pytest
import
torch
import
vllm._custom_ops
as
ops
from
tests.kernels.quant_utils
import
(
ref_dynamic_per_tensor_fp8_quant
,
ref_dynamic_per_token_quant
)
# Input dtypes the fp8 quantization kernels are exercised with.
DTYPES = [torch.half, torch.bfloat16, torch.float]

# Arbitrary values for testing
HIDDEN_SIZES = [
    1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, 8193,
]
# vectorized conversion edge cases
HIDDEN_SIZES.extend(range(1024, 1033))

# Arbitrary values for testing
NUM_TOKENS = [1, 7, 83, 4096]

# Whether a scale upper bound is passed to the per-token kernel.
SCALE_UBS = [True, False]
SEEDS = [0]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("scale_ub", SCALE_UBS)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
                                     dtype: torch.dtype, scale_ub: bool,
                                     seed: int) -> None:
    """Per-token dynamic fp8 quantization: the kernel output and scales must
    agree with the PyTorch reference, with and without a scale upper bound.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # The small offset shifts the random input away from zero (avoid nans).
    activations = torch.rand(
        num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6

    # When requested, the mean of the input serves as the upper bound.
    if scale_ub:
        upper_bound = torch.mean(activations).to(dtype=torch.float32,
                                                 device='cuda')
    else:
        upper_bound = None

    expected_out, expected_scales = ref_dynamic_per_token_quant(
        activations, torch.float8_e4m3fn, upper_bound)
    kernel_out, kernel_scales = ops.scaled_fp8_quant(
        activations, scale_ub=upper_bound, use_per_token_if_dynamic=True)

    assert torch.allclose(expected_scales, kernel_scales)
    # Both outputs are converted to float32 before comparing.
    expected_f32 = expected_out.to(dtype=torch.float32)
    kernel_f32 = kernel_out.to(dtype=torch.float32)
    assert torch.allclose(expected_f32, kernel_f32)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
                                      dtype: torch.dtype, seed: int) -> None:
    """Per-tensor dynamic fp8 quantization: the kernel output and scale must
    agree with the PyTorch reference.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    activations = torch.rand(num_tokens,
                             hidden_size,
                             dtype=dtype,
                             device="cuda")

    expected_out, expected_scale = ref_dynamic_per_tensor_fp8_quant(
        activations)
    kernel_out, kernel_scale = ops.scaled_fp8_quant(activations)

    assert torch.allclose(expected_scale, kernel_scale)
    # Both outputs are converted to float32 before comparing.
    expected_f32 = expected_out.to(dtype=torch.float32)
    kernel_f32 = kernel_out.to(dtype=torch.float32)
    assert torch.allclose(expected_f32, kernel_f32)
# Regression test for a case with large activations where an int32 index cannot
# represent the number of elements.
@torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None:
    """Quantize a very large activation tensor with a precomputed scale and
    compare the kernel output with the PyTorch reference.

    The scale comes from the reference implementation and is handed to the
    kernel, so only the quantized values are compared.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    num_tokens = 1024000  # Mistral-Nemo's max_position_embeddings
    hidden_size = 1152  # Smallest hidden_size to reproduce the error
    dtype = torch.bfloat16

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
    ref_out, scale = ref_dynamic_per_tensor_fp8_quant(x)
    # Reuse the reference scale; the scale the kernel returns is ignored.
    ops_out, _ = ops.scaled_fp8_quant(x, scale)

    # Minimize memory footprint in this test by freeing x and upconverting
    # the outputs in place. (torch.allclose does not support fp8)
    # Keep this order: x must be released before the conversions allocate.
    del x
    ref_out = ref_out.to(dtype=dtype)
    ops_out = ops_out.to(dtype=dtype)

    assert torch.allclose(ref_out, ops_out)
tests/kernels/test_int8_quant.py
View file @
500b93c8
...
...
@@ -3,6 +3,8 @@ import torch
# ruff: noqa: F401
import
vllm._C
from
tests.kernels.quant_utils
import
ref_dynamic_per_token_quant
from
vllm._custom_ops
import
scaled_int8_quant
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
...
...
@@ -21,23 +23,16 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
dtype
:
torch
.
dtype
,
seed
:
int
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x_token_max
,
_
=
x
.
max
(
dim
=
1
)
x_token_max
=
x_token_max
.
to
(
dtype
=
torch
.
float32
)
scales
=
(
x_token_max
/
float
(
127.0
))[:,
None
].
to
(
device
=
"cuda"
,
dtype
=
torch
.
float32
)
torch_out
=
(
x
/
scales
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
ops_out
=
torch
.
empty_like
(
x
,
dtype
=
torch
.
int8
,
device
=
"cuda"
)
scales_out
=
torch
.
empty_like
(
scales
,
dtype
=
torch
.
float32
,
device
=
"cuda"
)
torch
.
ops
.
_C
.
dynamic_scaled_int8_quant
(
ops_out
,
x
,
scales_out
)
# reference
ref_out
,
ref_scales
=
ref_dynamic_per_token_quant
(
x
,
torch
.
int8
)
# kernel
ops_out
,
ops_scales
=
scaled_int8_quant
(
x
)
assert
torch
.
allclose
(
scales
_out
,
scales
)
assert
torch
.
allclose
(
torch
_out
,
ops
_out
,
assert
torch
.
allclose
(
ops_
scales
,
ref_
scales
)
assert
torch
.
allclose
(
ops
_out
,
ref
_out
,
atol
=
1
)
# big atol to account for rounding errors
...
...
@@ -55,12 +50,11 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
scale
=
torch
.
tensor
([
scale
],
dtype
=
torch
.
float32
,
device
=
"cuda"
)
out1
=
(
x
/
scale
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
out2
=
torch
.
empty_like
(
x
,
dtype
=
torch
.
int8
)
scale_argument
=
torch
.
tensor
([
scale
],
dtype
=
torch
.
float32
,
device
=
"cuda"
)
out2
,
_
=
scaled_int8_quant
(
x
,
scale
)
torch
.
ops
.
_C
.
static_scaled_int8_quant
(
out2
,
x
,
scale_argument
)
assert
torch
.
allclose
(
out1
,
out2
,
atol
=
1
)
# big atol to account for rounding errors
tests/kernels/test_marlin_gemm.py
View file @
500b93c8
...
...
@@ -12,17 +12,18 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
,
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils
import
(
GPTQ_MARLIN_MAX_PARALLEL
,
GPTQ_MARLIN_MIN_THREAD_N
,
GPTQ_
MARLIN_SUPPORTED_GROUP_SIZES
,
GPTQ_
MARLIN_SUPPORTED_NUM_BITS
,
marlin_permute_scales
)
MARLIN_SUPPORTED_GROUP_SIZES
,
MARLIN_SUPPORTED_NUM_BITS
,
marlin_make_empty_g_idx
,
marlin_permute_scales
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_fp8
import
(
pack_fp8_to_int32
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test
import
(
MarlinWorkspace
,
get_weight_perm
,
marlin_quantize
,
marlin_weights
)
MarlinWorkspace
,
awq_marlin_quantize
,
get_weight_perm
,
marlin_quantize
,
marlin_weights
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_test_24
import
(
marlin_24_quantize
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
gptq_pack
,
quantize_weights
,
sort_weights
)
from
vllm.utils
import
is_hip
awq_pack
,
gptq_pack
,
quantize_weights
,
quantize_weights_with_zp
,
sort_weights
)
ACT_ORDER_OPTS
=
[
False
,
True
]
K_FULL_OPTS
=
[
False
,
True
]
...
...
@@ -58,12 +59,12 @@ def rand_data(shape, dtype=torch.float16):
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
GPTQ_
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_marlin_repack
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
act_order
,
mnk_factors
):
def
test_
gptq_
marlin_repack
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
act_order
,
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
...
...
@@ -121,12 +122,60 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
GPTQ_MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_awq_marlin_repack
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
size_k
=
k_chunk
*
k_factor
size_n
=
n_chunk
*
n_factor
print
(
f
"MNK =
{
size_m
}
{
size_n
}
{
size_k
}
"
)
# Normalize group_size
if
group_size
==
-
1
:
group_size
=
size_k
assert
group_size
<=
size_k
# Create input
b_weight
=
rand_data
((
size_k
,
size_n
))
# Quantize
w_ref
,
q_w
,
s
,
zp
=
quantize_weights_with_zp
(
b_weight
,
num_bits
,
group_size
)
# Pack to AWQ format
q_w_awq
=
awq_pack
(
q_w
,
num_bits
,
size_k
,
size_n
)
# Pack to Marlin format
weight_perm
=
get_weight_perm
(
num_bits
)
marlin_q_w_1
=
marlin_weights
(
q_w
,
size_k
,
size_n
,
num_bits
,
weight_perm
)
# Run Marlin repack GPU kernel
marlin_q_w_2
=
ops
.
awq_marlin_repack
(
q_w_awq
,
size_k
,
size_n
,
num_bits
,
)
torch
.
cuda
.
synchronize
()
assert
torch
.
allclose
(
marlin_q_w_1
,
marlin_q_w_2
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"k_chunk"
,
MARLIN_K_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"n_chunk"
,
MARLIN_N_CHUNKS
)
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
MARLIN_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
MARLIN_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"act_order"
,
ACT_ORDER_OPTS
)
@
pytest
.
mark
.
parametrize
(
"is_k_full"
,
K_FULL_OPTS
)
def
test_marlin_gemm
(
def
test_
gptq_
marlin_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
...
...
@@ -156,6 +205,8 @@ def test_marlin_gemm(
w_ref
,
marlin_q_w
,
marlin_s
,
g_idx
,
sort_indices
,
_
=
marlin_quantize
(
b_weight
,
num_bits
,
group_size
,
act_order
)
marlin_zp
=
marlin_make_empty_g_idx
(
marlin_s
.
device
)
workspace
=
MarlinWorkspace
(
size_n
,
GPTQ_MARLIN_MIN_THREAD_N
,
GPTQ_MARLIN_MAX_PARALLEL
)
...
...
@@ -163,6 +214,7 @@ def test_marlin_gemm(
a_input
,
marlin_q_w
,
marlin_s
,
marlin_zp
,
g_idx
,
sort_indices
,
workspace
.
scratch
,
...
...
@@ -171,6 +223,7 @@ def test_marlin_gemm(
b_weight
.
shape
[
1
],
a_input
.
shape
[
1
],
is_k_full
,
has_zp
=
False
,
)
output_ref
=
torch
.
matmul
(
a_input
,
w_ref
)
...
...
@@ -189,7 +242,8 @@ def test_marlin_gemm(
@
pytest
.
mark
.
parametrize
(
"num_bits"
,
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
)
@
pytest
.
mark
.
parametrize
(
"group_size"
,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
@
pytest
.
mark
.
parametrize
(
"mnk_factors"
,
MNK_FACTORS
)
def
test_marlin_24_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
):
def
test_gptq_marlin_24_gemm
(
k_chunk
,
n_chunk
,
num_bits
,
group_size
,
mnk_factors
):
m_factor
,
n_factor
,
k_factor
=
mnk_factors
size_m
=
m_factor
...
...
@@ -302,3 +356,65 @@ def test_fp8_marlin_gemm(
print
(
"max_diff = {}"
.
format
(
max_diff
))
assert
max_diff
<
0.04
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS)
@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
def test_awq_marlin_gemm(
    k_chunk,
    n_chunk,
    num_bits,
    group_size,
    mnk_factors,
):
    """Run the Marlin GEMM kernel on AWQ-quantized weights (with zero points)
    and compare the result with a plain matmul against the reference weights.
    """
    # Problem sizes are the chunk sizes scaled by the per-dimension factors.
    size_m, n_factor, k_factor = mnk_factors
    size_k = k_chunk * k_factor
    size_n = n_chunk * n_factor

    print(f"MNK = {size_m} {size_n} {size_k}")
    print(f"groupsize = {group_size}")

    # The input is drawn before the weight, as in the original ordering.
    a_input = rand_data((size_m, size_k))
    b_weight = rand_data((size_k, size_n))

    quantized = awq_marlin_quantize(b_weight, num_bits, group_size)
    w_ref, marlin_q_w, marlin_s, marlin_zp = quantized

    # Empty tensors are passed for the group index and sort indices.
    weight_device = marlin_q_w.device
    g_idx = torch.empty(0, dtype=torch.int, device=weight_device)
    sort_indices = torch.empty(0, dtype=torch.int, device=weight_device)
    is_k_full = True
    has_zp = True

    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                GPTQ_MARLIN_MAX_PARALLEL)

    gemm_m = a_input.shape[0]
    gemm_n = b_weight.shape[1]
    gemm_k = a_input.shape[1]
    output = ops.gptq_marlin_gemm(
        a_input,
        marlin_q_w,
        marlin_s,
        marlin_zp,
        g_idx,
        sort_indices,
        workspace.scratch,
        num_bits,
        gemm_m,
        gemm_n,
        gemm_k,
        is_k_full,
        has_zp,
    )
    output_ref = torch.matmul(a_input, w_ref)

    torch.cuda.synchronize()

    max_diff = compute_max_diff(output, output_ref)
    print("max_diff = {}".format(max_diff))

    assert max_diff < 0.04
tests/lora/conftest.py
View file @
500b93c8
...
...
@@ -159,8 +159,14 @@ def dummy_model_gate_up() -> nn.Module:
@
pytest
.
fixture
(
scope
=
"session"
)
def
sql_lora_files
():
return
snapshot_download
(
repo_id
=
"yard1/llama-2-7b-sql-lora-test"
)
def
sql_lora_huggingface_id
():
# huggingface repo id is used to test lora runtime downloading.
return
"yard1/llama-2-7b-sql-lora-test"
@
pytest
.
fixture
(
scope
=
"session"
)
def
sql_lora_files
(
sql_lora_huggingface_id
):
return
snapshot_download
(
repo_id
=
sql_lora_huggingface_id
)
@
pytest
.
fixture
(
scope
=
"session"
)
...
...
tests/lora/test_long_context.py
View file @
500b93c8
...
...
@@ -29,7 +29,7 @@ def _create_lora_request(lora_id, long_context_infos):
context_len
=
long_context_infos
[
lora_id
][
"context_length"
]
scaling_factor
=
context_len_to_scaling_factor
[
context_len
]
return
LoRARequest
(
context_len
,
lora_id
,
long_context_infos
[
lora_id
][
"lora"
],
long_context_infos
[
lora_id
][
"lora"
],
None
,
4096
*
scaling_factor
)
...
...
Prev
1
2
3
4
5
6
7
8
9
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment