Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
992e5c3d
Unverified
Commit
992e5c3d
authored
Feb 20, 2025
by
Harry Mellor
Committed by
GitHub
Feb 20, 2025
Browse files
Merge similar examples in `offline_inference` into single `basic` example (#12737)
parent
b69692a2
Changes
29
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1 addition
and
340 deletions
+1
-340
examples/offline_inference/basic_with_model_default_sampling.py
...es/offline_inference/basic_with_model_default_sampling.py
+0
-32
examples/offline_inference/chat.py
examples/offline_inference/chat.py
+0
-82
examples/offline_inference/classification.py
examples/offline_inference/classification.py
+0
-30
examples/offline_inference/cli.py
examples/offline_inference/cli.py
+0
-82
examples/offline_inference/cpu_offload.py
examples/offline_inference/cpu_offload.py
+0
-24
examples/offline_inference/embedding.py
examples/offline_inference/embedding.py
+0
-30
examples/offline_inference/gguf_inference.py
examples/offline_inference/gguf_inference.py
+0
-34
examples/offline_inference/scoring.py
examples/offline_inference/scoring.py
+0
-25
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+1
-1
No files found.
examples/offline_inference/basic_with_model_default_sampling.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create an LLM with built-in default generation config.
# The generation config is set to None by default to keep
# the behavior consistent with the previous version.
# If you want to use the default generation config from the model,
# you should set the generation_config to "auto".
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-0.5B-Instruct"
,
generation_config
=
"auto"
)
# Load the default sampling parameters from the model.
sampling_params
=
llm
.
get_default_sampling_params
()
# Modify the sampling parameters if needed.
sampling_params
.
temperature
=
0.5
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
examples/offline_inference/chat.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
sampling_params
=
SamplingParams
(
temperature
=
0.5
)
def
print_outputs
(
outputs
):
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
"-"
*
80
)
print
(
"="
*
80
)
# In this script, we demonstrate how to pass input to the chat method:
conversation
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"Hello"
},
{
"role"
:
"assistant"
,
"content"
:
"Hello! How can I assist you today?"
},
{
"role"
:
"user"
,
"content"
:
"Write an essay about the importance of higher education."
,
},
]
outputs
=
llm
.
chat
(
conversation
,
sampling_params
=
sampling_params
,
use_tqdm
=
False
)
print_outputs
(
outputs
)
# You can run batch inference with llm.chat API
conversation
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"Hello"
},
{
"role"
:
"assistant"
,
"content"
:
"Hello! How can I assist you today?"
},
{
"role"
:
"user"
,
"content"
:
"Write an essay about the importance of higher education."
,
},
]
conversations
=
[
conversation
for
_
in
range
(
10
)]
# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs
=
llm
.
chat
(
messages
=
conversations
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
print_outputs
(
outputs
)
# A chat template can be optionally supplied.
# If not, the model will use its default chat template.
# with open('template_falcon_180b.jinja', "r") as f:
# chat_template = f.read()
# outputs = llm.chat(
# conversations,
# sampling_params=sampling_params,
# use_tqdm=False,
# chat_template=chat_template,
# )
examples/offline_inference/classification.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create an LLM.
# You should pass task="classify" for classification models
model
=
LLM
(
model
=
"jason9693/Qwen2.5-1.5B-apeach"
,
task
=
"classify"
,
enforce_eager
=
True
,
)
# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs
=
model
.
classify
(
prompts
)
# Print the outputs.
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
probs
=
output
.
outputs
.
probs
probs_trimmed
=
((
str
(
probs
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
probs
)
>
16
else
probs
)
print
(
f
"Prompt:
{
prompt
!
r
}
| "
f
"Class Probabilities:
{
probs_trimmed
}
(size=
{
len
(
probs
)
}
)"
)
examples/offline_inference/cli.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
asdict
from
vllm
import
LLM
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
def
get_prompts
(
num_prompts
:
int
):
# The default sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
if
num_prompts
!=
len
(
prompts
):
prompts
=
(
prompts
*
((
num_prompts
//
len
(
prompts
))
+
1
))[:
num_prompts
]
return
prompts
def
main
(
args
):
# Create prompts
prompts
=
get_prompts
(
args
.
num_prompts
)
# Create a sampling params object.
sampling_params
=
SamplingParams
(
n
=
args
.
n
,
temperature
=
args
.
temperature
,
top_p
=
args
.
top_p
,
top_k
=
args
.
top_k
,
max_tokens
=
args
.
max_tokens
)
# Create an LLM.
# The default model is 'facebook/opt-125m'
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
llm
=
LLM
(
**
asdict
(
engine_args
))
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
if
__name__
==
'__main__'
:
parser
=
FlexibleArgumentParser
()
parser
=
EngineArgs
.
add_cli_args
(
parser
)
group
=
parser
.
add_argument_group
(
"SamplingParams options"
)
group
.
add_argument
(
"--num-prompts"
,
type
=
int
,
default
=
4
,
help
=
"Number of prompts used for inference"
)
group
.
add_argument
(
"--max-tokens"
,
type
=
int
,
default
=
16
,
help
=
"Generated output length for sampling"
)
group
.
add_argument
(
'--n'
,
type
=
int
,
default
=
1
,
help
=
'Number of generated sequences per prompt'
)
group
.
add_argument
(
'--temperature'
,
type
=
float
,
default
=
0.8
,
help
=
'Temperature for text generation'
)
group
.
add_argument
(
'--top-p'
,
type
=
float
,
default
=
0.95
,
help
=
'top_p for text generation'
)
group
.
add_argument
(
'--top-k'
,
type
=
int
,
default
=-
1
,
help
=
'top_k for text generation'
)
args
=
parser
.
parse_args
()
main
(
args
)
examples/offline_inference/cpu_offload.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
,
SamplingParams
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM.
llm
=
LLM
(
model
=
"meta-llama/Llama-2-13b-chat-hf"
,
cpu_offload_gb
=
10
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
examples/offline_inference/embedding.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
# Create an LLM.
# You should pass task="embed" for embedding models
model
=
LLM
(
model
=
"intfloat/e5-mistral-7b-instruct"
,
task
=
"embed"
,
enforce_eager
=
True
,
)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs
=
model
.
embed
(
prompts
)
# Print the outputs.
for
prompt
,
output
in
zip
(
prompts
,
outputs
):
embeds
=
output
.
outputs
.
embedding
embeds_trimmed
=
((
str
(
embeds
[:
16
])[:
-
1
]
+
", ...]"
)
if
len
(
embeds
)
>
16
else
embeds
)
print
(
f
"Prompt:
{
prompt
!
r
}
| "
f
"Embeddings:
{
embeds_trimmed
}
(size=
{
len
(
embeds
)
}
)"
)
examples/offline_inference/gguf_inference.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
huggingface_hub
import
hf_hub_download
from
vllm
import
LLM
,
SamplingParams
def
run_gguf_inference
(
model_path
,
tokenizer
):
# Sample prompts.
prompts
=
[
"How many helicopters can a human eat in one sitting?"
,
"What's the future of AI?"
,
]
prompts
=
[[{
"role"
:
"user"
,
"content"
:
prompt
}]
for
prompt
in
prompts
]
# Create a sampling params object.
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
128
)
# Create an LLM.
llm
=
LLM
(
model
=
model_path
,
tokenizer
=
tokenizer
)
outputs
=
llm
.
chat
(
prompts
,
sampling_params
)
# Print the outputs.
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
if
__name__
==
"__main__"
:
repo_id
=
"bartowski/Phi-3-medium-4k-instruct-GGUF"
filename
=
"Phi-3-medium-4k-instruct-IQ2_M.gguf"
tokenizer
=
"microsoft/Phi-3-medium-4k-instruct"
model
=
hf_hub_download
(
repo_id
,
filename
=
filename
)
run_gguf_inference
(
model
,
tokenizer
)
examples/offline_inference/scoring.py
deleted
100644 → 0
View file @
b69692a2
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
# Sample prompts.
text_1
=
"What is the capital of France?"
texts_2
=
[
"The capital of Brazil is Brasilia."
,
"The capital of France is Paris."
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
model
=
LLM
(
model
=
"BAAI/bge-reranker-v2-m3"
,
task
=
"score"
,
enforce_eager
=
True
,
)
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs
=
model
.
score
(
text_1
,
texts_2
)
# Print the outputs.
for
text_2
,
output
in
zip
(
texts_2
,
outputs
):
score
=
output
.
outputs
.
score
print
(
f
"Pair:
{
[
text_1
,
text_2
]
!
r
}
| Score:
{
score
}
"
)
tests/plugins_tests/test_platform_plugins.py
View file @
992e5c3d
...
@@ -14,7 +14,7 @@ def test_platform_plugins():
...
@@ -14,7 +14,7 @@ def test_platform_plugins():
import
os
import
os
example_file
=
os
.
path
.
join
(
example_file
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
current_file
))),
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
current_file
))),
"examples"
,
"offline_inference/basic.py"
)
"examples"
,
"offline_inference/basic
/basic
.py"
)
runpy
.
run_path
(
example_file
)
runpy
.
run_path
(
example_file
)
# check if the plugin is loaded correctly
# check if the plugin is loaded correctly
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment