Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
257afc37
Unverified
Commit
257afc37
authored
Aug 29, 2024
by
Harsha vardhan manoj Bikki
Committed by
GitHub
Aug 29, 2024
Browse files
[Neuron] Adding support for context-length, token-gen buckets. (#7885)
Co-authored-by:
Harsha Bikki
<
harbikh@amazon.com
>
parent
86a677de
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
33 additions
and
11 deletions
+33
-11
examples/offline_inference_neuron.py
examples/offline_inference_neuron.py
+9
-2
vllm/model_executor/model_loader/neuron.py
vllm/model_executor/model_loader/neuron.py
+24
-9
No files found.
examples/offline_inference_neuron.py
View file @
257afc37
import
os
from
vllm
import
LLM
,
SamplingParams
# creates XLA hlo graphs for all the context length buckets.
os
.
environ
[
'NEURON_CONTEXT_LENGTH_BUCKETS'
]
=
"128,512,1024,2048"
# creates XLA hlo graphs for all the token gen buckets.
os
.
environ
[
'NEURON_TOKEN_GEN_BUCKETS'
]
=
"128,512,1024,2048"
# Sample prompts.
prompts
=
[
"Hello, my name is"
,
...
...
@@ -19,8 +26,8 @@ llm = LLM(
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len
=
1
28
,
block_size
=
1
28
,
max_model_len
=
2
04
8
,
block_size
=
2
04
8
,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
...
...
vllm/model_executor/model_loader/neuron.py
View file @
257afc37
"""Utilities for selecting and loading neuron models."""
import
importlib
import
os
from
typing
import
Dict
,
Optional
,
Tuple
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
...
...
@@ -109,6 +109,17 @@ def _get_model_architecture(config: PretrainedConfig) -> str:
f
"
{
list
(
_NEURON_SUPPORTED_MODELS
.
keys
())
}
"
)
def
_get_buckets
(
env
:
str
,
default_value
:
List
[
int
])
->
List
[
int
]:
env_value
=
os
.
getenv
(
env
)
if
env_value
is
None
:
return
default_value
buckets_remove_empty
=
filter
(
lambda
x
:
x
is
not
None
and
len
(
x
.
strip
())
>
0
,
env_value
.
split
(
","
))
buckets_int
=
map
(
int
,
buckets_remove_empty
)
buckets_list
=
list
(
buckets_int
)
return
buckets_list
def
get_neuron_model
(
model_config
:
ModelConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
)
->
nn
.
Module
:
...
...
@@ -123,14 +134,18 @@ def get_neuron_model(model_config: ModelConfig,
neuron_config
=
NeuronConfig
(
continuous_batching
=
continuous_batching_config
)
context_length_estimates
=
_get_buckets
(
"NEURON_CONTEXT_LENGTH_BUCKETS"
,
[
scheduler_config
.
max_model_len
])
n_positions
=
_get_buckets
(
"NEURON_TOKEN_GEN_BUCKETS"
,
[
scheduler_config
.
max_model_len
])
# Load the weights from the cached or downloaded files.
model
.
load_weights
(
model_config
.
model
,
tp_degree
=
parallel_config
.
tensor_parallel_size
,
amp
=
TORCH_DTYPE_TO_NEURON_AMP
[
model_config
.
dtype
],
neuron_config
=
neuron_config
,
context_length_estimate
=
[
scheduler_config
.
max_model_len
],
n_positions
=
[
scheduler_config
.
max_model_len
],
batch_size
=
scheduler_config
.
max_num_seqs
)
model
.
load_weights
(
model_config
.
model
,
tp_degree
=
parallel_config
.
tensor_parallel_size
,
amp
=
TORCH_DTYPE_TO_NEURON_AMP
[
model_config
.
dtype
],
neuron_config
=
neuron_config
,
context_length_estimate
=
context_length_estimates
,
n_positions
=
n_positions
,
batch_size
=
scheduler_config
.
max_num_seqs
)
return
model
.
eval
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment