Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0f465ab5
Unverified
Commit
0f465ab5
authored
Jan 28, 2025
by
Gabriel Marinho
Committed by
GitHub
Jan 28, 2025
Browse files
[FEATURE] Enables offline /score for embedding models (#12021)
Signed-off-by:
Gabriel Marinho
<
gmarinho@ibm.com
>
parent
23a7cbc8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
216 additions
and
44 deletions
+216
-44
tests/models/embedding/language/test_scoring.py
tests/models/embedding/language/test_scoring.py
+100
-0
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+116
-44
No files found.
tests/models/embedding/language/test_scoring.py
View file @
0f465ab5
...
...
@@ -5,12 +5,18 @@ Run `pytest tests/models/embedding/language/test_scoring.py`.
import
math
import
pytest
import
torch
import
torch.nn.functional
as
F
# Model checkpoints exercised by this test module.
# NOTE(review): MODELS is not referenced in the visible part of the file;
# presumably it parametrizes the cross-encoder tests above — confirm.
MODELS
=
[
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
# Bert
"BAAI/bge-reranker-v2-m3"
,
# Roberta
]
# Checkpoints fed to the `emb_model_name` fixture; the `*_embedding` tests
# load them with task="embed" and compare `score()` with cosine similarity.
EMBEDDING_MODELS
=
[
"sentence-transformers/all-MiniLM-L12-v2"
,
]
TEXTS_1
=
[
"What is the capital of France?"
,
"What is the capital of Germany?"
,
...
...
@@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert
math
.
isclose
(
hf_outputs
[
0
],
vllm_outputs
[
0
],
rel_tol
=
0.01
)
assert
math
.
isclose
(
hf_outputs
[
1
],
vllm_outputs
[
1
],
rel_tol
=
0.01
)
@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
    """Module-scoped fixture producing each ``EMBEDDING_MODELS`` entry in turn.

    Every test that requests this fixture runs once per listed checkpoint.
    """
    model_name = request.param
    yield model_name
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Check that scoring one text against one text matches the HF reference.

    The reference is the cosine similarity (along dim 0) of the two
    embeddings returned by the HF runner's ``encode``; the value returned by
    ``vllm_model.score`` must agree within a relative tolerance of 1%.
    """
    first_text, second_text = TEXTS_1[0], TEXTS_2[0]

    # Reference side: embed both texts together, then compare them.
    with hf_runner(emb_model_name,
                   dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        reference_embeddings = hf_model.encode([first_text, second_text])
        reference_tensors = [
            torch.tensor(embedding) for embedding in reference_embeddings
        ]
        hf_outputs = [F.cosine_similarity(*reference_tensors, dim=0)]

    # System under test: the offline score API on an embedding model.
    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(first_text, second_text)

    assert len(vllm_outputs) == 1
    assert len(hf_outputs) == 1

    expected, actual = hf_outputs[0], vllm_outputs[0]
    assert math.isclose(expected, actual, rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Check that scoring one text against several matches the HF reference.

    ``TEXTS_1[0]`` is paired with each of the first two ``TEXTS_2`` entries.
    Each pair's reference score is the cosine similarity (along dim 0) of the
    embeddings returned by the HF runner's ``encode``; ``vllm_model.score``
    is called with the single text and the whole ``TEXTS_2`` list and must
    agree within a relative tolerance of 1%.
    """
    anchor_text = TEXTS_1[0]
    text_pairs = [
        [anchor_text, TEXTS_2[0]],
        [anchor_text, TEXTS_2[1]],
    ]

    with hf_runner(emb_model_name,
                   dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        # Encode every pair first, then reduce each pair to one similarity.
        hf_embeddings = []
        for text_pair in text_pairs:
            hf_embeddings.append(hf_model.encode(text_pair))

        hf_outputs = []
        for pair in hf_embeddings:
            pair_tensors = [torch.tensor(embedding) for embedding in pair]
            hf_outputs.append(F.cosine_similarity(*pair_tensors, dim=0))

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(anchor_text, TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for expected, actual in zip(hf_outputs, vllm_outputs):
        assert math.isclose(expected, actual, rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
                              dtype: str):
    """Check that element-wise scoring of two lists matches the HF reference.

    The first two entries of ``TEXTS_1`` and ``TEXTS_2`` are paired by index.
    Each pair's reference score is the cosine similarity (along dim 0) of the
    embeddings returned by the HF runner's ``encode``; ``vllm_model.score``
    is called with both full lists and must agree within a relative
    tolerance of 1%.
    """
    text_pairs = [[TEXTS_1[index], TEXTS_2[index]] for index in range(2)]

    with hf_runner(emb_model_name,
                   dtype=dtype,
                   is_sentence_transformer=True) as hf_model:
        # Encode every pair first, then reduce each pair to one similarity.
        hf_embeddings = []
        for text_pair in text_pairs:
            hf_embeddings.append(hf_model.encode(text_pair))

        hf_outputs = []
        for pair in hf_embeddings:
            pair_tensors = [torch.tensor(embedding) for embedding in pair]
            hf_outputs.append(F.cosine_similarity(*pair_tensors, dim=0))

    with vllm_runner(emb_model_name,
                     task="embed",
                     dtype=dtype,
                     max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

    assert len(vllm_outputs) == 2
    assert len(hf_outputs) == 2

    for expected, actual in zip(hf_outputs, vllm_outputs):
        assert math.isclose(expected, actual, rel_tol=0.01)
vllm/entrypoints/llm.py
View file @
0f465ab5
...
...
@@ -5,6 +5,7 @@ from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence,
Tuple
,
Type
,
Union
,
cast
,
overload
)
import
cloudpickle
import
torch
import
torch.nn
as
nn
from
tqdm
import
tqdm
from
typing_extensions
import
TypeVar
,
deprecated
...
...
@@ -996,6 +997,107 @@ class LLM:
return
[
ClassificationRequestOutput
.
from_base
(
item
)
for
item
in
items
]
def _embedding_score(
    self,
    tokenizer: AnyTokenizer,
    text_1: List[Union[str, TextPrompt, TokensPrompt]],
    text_2: List[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: bool = True,
    lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> List[ScoringRequestOutput]:
    """Score text pairs by the cosine similarity of their embeddings.

    All prompts from ``text_1`` and ``text_2`` are embedded with a single
    ``self.encode`` call. The results are split back into the two sides and
    each pair is scored with ``torch.nn.CosineSimilarity`` along dimension 0.
    When ``text_1`` holds exactly one prompt, its embedding is reused for
    every entry of ``text_2``.

    Args:
        tokenizer: Only consulted for an optional ``pad_token_id``, which is
            inserted between the two prompts' token ids in each result.
        text_1: Prompts for the first side of each pair.
        text_2: Prompts for the second side of each pair.
        truncate_prompt_tokens: NOTE(review): accepted so the signature
            mirrors ``_cross_encoding_score`` but not used here; it is not
            forwarded to ``self.encode``.
        use_tqdm: Forwarded to ``self.encode``.
        lora_request: Forwarded to ``self.encode``.
        prompt_adapter_request: Forwarded to ``self.encode``.

    Returns:
        One ``ScoringRequestOutput`` per scored pair, in pair order.
    """
    encoded_output = self.encode(
        text_1 + text_2,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        prompt_adapter_request=prompt_adapter_request)

    # Undo the concatenation: the first len(text_1) results belong to text_1.
    split_at = len(text_1)
    encoded_output_1 = encoded_output[:split_at]
    encoded_output_2 = encoded_output[split_at:]

    # 1-to-N scoring: reuse the single text_1 embedding for every text_2.
    if len(encoded_output_1) == 1:
        encoded_output_1 = encoded_output_1 * len(encoded_output_2)

    # Loop invariants: the scorer and the (optional) separator token do not
    # depend on the pair being processed.
    scorer = torch.nn.CosineSimilarity(0)
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    separator = [] if pad_token_id is None else [pad_token_id]

    scores = []
    for embed_1, embed_2 in zip(encoded_output_1, encoded_output_2):
        pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data)

        # Token ids of both prompts, joined by the pad token when one exists.
        tokens = (embed_1.prompt_token_ids + separator +
                  embed_2.prompt_token_ids)

        scores.append(
            PoolingRequestOutput(
                request_id=f"{embed_1.request_id}_{embed_2.request_id}",
                outputs=pair_score,
                prompt_token_ids=tokens,
                finished=True))

    items = self.engine_class.validate_outputs(scores, PoolingRequestOutput)
    return [ScoringRequestOutput.from_base(item) for item in items]
def _cross_encoding_score(
    self,
    tokenizer: AnyTokenizer,
    text_1: List[Union[str, TextPrompt, TokensPrompt]],
    text_2: List[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: bool = True,
    lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
    prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> List[ScoringRequestOutput]:
    """Score text pairs by running each pair through the model jointly.

    Each ``(text_1[i], text_2[i])`` pair is tokenized in one tokenizer call
    (``text=`` / ``text_pair=``), submitted to the engine as a
    ``TokensPrompt`` with default ``PoolingParams``, and the engine outputs
    are converted to ``ScoringRequestOutput``. When ``text_1`` holds exactly
    one prompt it is repeated to pair with every entry of ``text_2``.

    Args:
        tokenizer: Callable tokenizer accepting ``text`` and ``text_pair``.
        text_1: Prompts for the first side of each pair.
        text_2: Prompts for the second side of each pair.
        truncate_prompt_tokens: When not ``None``, tokenization is run with
            ``truncation=True`` and ``max_length`` set to this value.
        use_tqdm: Forwarded to ``self._run_engine``.
        lora_request: Forwarded to ``self._validate_and_add_requests``.
        prompt_adapter_request: Forwarded to
            ``self._validate_and_add_requests``.

    Returns:
        One ``ScoringRequestOutput`` per validated engine output.

    Raises:
        ValueError: If ``tokenizer`` is a ``MistralTokenizer``.
    """
    if isinstance(tokenizer, MistralTokenizer):
        # Name the real cause; this guard is about the tokenizer, not the
        # configured task.
        raise ValueError("MistralTokenizer not supported for cross-encoding")

    # 1-to-N scoring: repeat the single text_1 prompt for every text_2.
    if len(text_1) == 1:
        text_1 = text_1 * len(text_2)

    pooling_params = PoolingParams()

    tokenization_kwargs: Dict[str, Any] = {}
    if truncate_prompt_tokens is not None:
        tokenization_kwargs["truncation"] = True
        tokenization_kwargs["max_length"] = truncate_prompt_tokens

    parsed_prompts = []
    for q, t in zip(text_1, text_2):
        prompt_inputs = tokenizer(text=q, text_pair=t, **tokenization_kwargs)
        # token_type_ids is optional: .get() yields None when the tokenizer
        # does not produce it.
        parsed_prompts.append(
            TokensPrompt(
                prompt_token_ids=prompt_inputs["input_ids"],
                token_type_ids=prompt_inputs.get("token_type_ids")))

    self._validate_and_add_requests(
        prompts=parsed_prompts,
        params=pooling_params,
        lora_request=lora_request,
        prompt_adapter_request=prompt_adapter_request,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)
    items = self.engine_class.validate_outputs(outputs, PoolingRequestOutput)
    return [ScoringRequestOutput.from_base(item) for item in items]
def
score
(
self
,
text_1
:
Union
[
SingletonPrompt
,
Sequence
[
SingletonPrompt
]],
...
...
@@ -1047,25 +1149,20 @@ class LLM:
raise
ValueError
(
" "
.
join
(
messages
))
if
not
self
.
llm_engine
.
model_config
.
is_cross_encoder
:
raise
ValueError
(
"Your model does not support cross encoding"
)
if
self
.
llm_engine
.
model_config
.
task
!=
"score"
:
raise
ValueError
(
"Score API is only enabled for `--task score`"
)
tokenizer
=
self
.
llm_engine
.
get_tokenizer
()
if
isinstance
(
tokenizer
,
MistralTokenizer
):
if
self
.
llm_engine
.
model_config
.
task
not
in
(
"embed"
,
"score"
):
raise
ValueError
(
"
MistralTokenizer not support
ed
f
or
cross-encoding
"
)
"
Score API is only enabled for `--task emb
ed or
--task score`
"
)
# the tokenizer for models such as
# "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
# lists of tokens to the `text` and `text_pair` kwargs
tokenizer
=
self
.
llm_engine
.
get_tokenizer
()
def
ensure_str
(
prompt
:
SingletonPrompt
):
if
isinstance
(
prompt
,
dict
):
if
"multi_modal_data"
in
prompt
:
raise
ValueError
(
"Multi-modal prompt is not "
"supported for
cross en
co
d
ing"
)
"supported for
s
co
r
ing"
)
elif
"prompt_token_ids"
in
prompt
:
prompt
=
tokenizer
.
decode
(
cast
(
TokensPrompt
,
prompt
)[
"prompt_token_ids"
])
...
...
@@ -1091,40 +1188,15 @@ class LLM:
if
len
(
text_2
)
==
0
:
raise
ValueError
(
"At least one text_pair element must be given"
)
if
len
(
text_1
)
==
1
:
text_1
=
text_1
*
len
(
text_2
)
input_pairs
=
[(
t1
,
t2
)
for
t1
,
t2
in
zip
(
text_1
,
text_2
)]
pooling_params
=
PoolingParams
()
tokenization_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
truncate_prompt_tokens
is
not
None
:
tokenization_kwargs
[
"truncation"
]
=
True
tokenization_kwargs
[
"max_length"
]
=
truncate_prompt_tokens
parsed_prompts
=
[]
for
q
,
t
in
input_pairs
:
prompt_inputs
=
tokenizer
(
text
=
q
,
text_pair
=
t
,
**
tokenization_kwargs
)
engine_prompt
=
TokensPrompt
(
prompt_token_ids
=
prompt_inputs
[
"input_ids"
],
token_type_ids
=
prompt_inputs
.
get
(
"token_type_ids"
))
parsed_prompts
.
append
(
engine_prompt
)
self
.
_validate_and_add_requests
(
prompts
=
parsed_prompts
,
params
=
pooling_params
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
)
outputs
=
self
.
_run_engine
(
use_tqdm
=
use_tqdm
)
items
=
self
.
engine_class
.
validate_outputs
(
outputs
,
PoolingRequestOutput
)
return
[
ScoringRequestOutput
.
from_base
(
item
)
for
item
in
items
]
if
self
.
llm_engine
.
model_config
.
is_cross_encoder
:
return
self
.
_cross_encoding_score
(
tokenizer
,
text_1
,
text_2
,
truncate_prompt_tokens
,
use_tqdm
,
lora_request
,
prompt_adapter_request
)
else
:
return
self
.
_embedding_score
(
tokenizer
,
text_1
,
text_2
,
truncate_prompt_tokens
,
use_tqdm
,
lora_request
,
prompt_adapter_request
)
def
start_profile
(
self
)
->
None
:
self
.
llm_engine
.
start_profile
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment