Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0a573867
Unverified
Commit
0a573867
authored
Apr 07, 2025
by
Cyrus Leung
Committed by
GitHub
Apr 07, 2025
Browse files
[Misc] Update Mistral-3.1 example (#16147)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
3749e287
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
22 additions
and
8 deletions
+22
-8
examples/offline_inference/mistral-small.py
examples/offline_inference/mistral-small.py
+22
-8
No files found.
examples/offline_inference/mistral-small.py
View file @
0a573867
...
...
@@ -13,9 +13,14 @@ from vllm.sampling_params import SamplingParams
# - Server:
#
# ```bash
# # Mistral format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
#
# # HF format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
...
...
@@ -44,19 +49,22 @@ from vllm.sampling_params import SamplingParams
# python demo.py simple
# python demo.py advanced
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
# These scripts have been tested on 2x L40 GPUs
def
run_simple_demo
(
args
:
argparse
.
Namespace
):
model_name
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
sampling_params
=
SamplingParams
(
max_tokens
=
8192
)
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
,
tokenizer_mode
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
config_format
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
load_format
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
tensor_parallel_size
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
...
...
@@ -88,17 +96,18 @@ def run_simple_demo(args: argparse.Namespace):
def
run_advanced_demo
(
args
:
argparse
.
Namespace
):
model_name
=
"mistralai/Mistral-Small-3.1-24B-Instruct-2503"
max_img_per_msg
=
5
max_img_per_msg
=
3
max_tokens_per_img
=
4096
sampling_params
=
SamplingParams
(
max_tokens
=
8192
,
temperature
=
0.7
)
llm
=
LLM
(
model
=
model_name
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
,
tokenizer_mode
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
config_format
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
load_format
=
"mistral"
if
args
.
format
==
"mistral"
else
"auto"
,
limit_mm_per_prompt
=
{
"image"
:
max_img_per_msg
},
max_model_len
=
max_img_per_msg
*
max_tokens_per_img
,
tensor_parallel_size
=
2
,
disable_mm_preprocessor_cache
=
args
.
disable_mm_preprocessor_cache
,
)
...
...
@@ -166,6 +175,11 @@ def main():
help
=
"Specify the demo mode: 'simple' or 'advanced'"
,
)
parser
.
add_argument
(
'--format'
,
choices
=
[
"mistral"
,
"hf"
],
default
=
"mistral"
,
help
=
'Specify the format of the model to load.'
)
parser
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment