Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1190e964
Commit
1190e964
authored
Jul 31, 2024
by
zhuwenwen
Browse files
update benchmarks and examples
parent
69185c0b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
2 deletions
+25
-2
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+23
-0
examples/offline_inference.py
examples/offline_inference.py
+1
-1
setup.py
setup.py
+1
-1
No files found.
benchmarks/benchmark_throughput.py
View file @
1190e964
...
...
@@ -5,11 +5,13 @@ import random
import
time
from
typing
import
List
,
Optional
,
Tuple
import
numpy
as
np
import
torch
from
tqdm
import
tqdm
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
PreTrainedTokenizerBase
)
from
vllm.inputs
import
PromptStrictInputs
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
...
...
@@ -119,6 +121,23 @@ def run_vllm(
max_tokens
=
output_len
,
))
# warmup
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
args
.
num_prompts
,
args
.
input_len
))
dummy_inputs
:
List
[
PromptStrictInputs
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
def
run_to_completion
():
llm
.
generate
(
dummy_inputs
,
sampling_params
=
sampling_params
,
use_tqdm
=
False
)
print
(
"Warming up..."
)
for
_
in
tqdm
(
range
(
args
.
num_iters_warmup
),
desc
=
"Warmup iterations"
):
run_to_completion
()
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
...
...
@@ -295,6 +314,10 @@ if __name__ == "__main__":
default
=
1
,
help
=
"Number of generated sequences per prompt."
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
'--num-iters-warmup'
,
type
=
int
,
default
=
1
,
help
=
'Number of iterations to run for warmup.'
)
parser
.
add_argument
(
"--num-prompts"
,
type
=
int
,
default
=
1000
,
...
...
examples/offline_inference.py
View file @
1190e964
...
...
@@ -12,7 +12,7 @@ if __name__ == '__main__':
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM.
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
trust_remote_code
=
True
,
dtype
=
"float16"
,
enforce_eager
=
Fals
e
)
llm
=
LLM
(
model
=
"facebook/opt-125m"
,
trust_remote_code
=
True
,
dtype
=
"float16"
,
enforce_eager
=
Tru
e
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
...
...
setup.py
View file @
1190e964
...
...
@@ -332,7 +332,7 @@ def get_version_add(sha: Optional[str] = None) -> str:
if
sha
!=
'Unknown'
:
if
sha
is
None
:
sha
=
get_sha
(
vllm_root
)
version
=
'das1.
1
.git'
+
sha
[:
7
]
version
=
'das1.
2
.git'
+
sha
[:
7
]
# abi version
version
+=
"."
+
get_abi
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment