norm / vllm · Commits

Commit 40542023
Authored Feb 24, 2024 by zhuwenwen

    merge v0.3.2

Parents: 5e5b497d, 8fbd84bf

Changes: 44
Showing 4 changed files with 88 additions and 2 deletions (+88 −2)
vllm/transformers_utils/configs/__init__.py   +2  −0
vllm/transformers_utils/configs/olmo.py       +72 −0
vllm/worker/model_runner.py                   +10 −0
vllm/worker/worker.py                         +4  −2
vllm/transformers_utils/configs/__init__.py

 from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
 from vllm.transformers_utils.configs.mpt import MPTConfig
+from vllm.transformers_utils.configs.olmo import OLMoConfig
 from vllm.transformers_utils.configs.qwen import QWenConfig
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
...
@@ -11,6 +12,7 @@ __all__ = [
     "BaiChuanConfig",
     "ChatGLMConfig",
     "MPTConfig",
+    "OLMoConfig",
     "QWenConfig",
     "RWConfig",
 ]
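For context, a minimal usage sketch of the newly exported class; the constructor values below are illustrative, not taken from this commit:

from vllm.transformers_utils.configs import OLMoConfig

# Instantiate the newly exported OLMo config; sizes are illustrative.
config = OLMoConfig(d_model=2048, n_heads=16, n_layers=16)
print(config.model_type)  # -> "olmo"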
vllm/transformers_utils/configs/olmo.py (new file, mode 100644)

# coding=utf-8
# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py
"""OLMo configuration"""
from transformers import PretrainedConfig


class OLMoConfig(PretrainedConfig):
    model_type = 'olmo'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
    }

    # Note that the defaults for these attributes are equivalent to the base GPT2 model.
    def __init__(
        self,
        d_model=768,
        n_heads=12,
        n_layers=12,
        mlp_ratio=4,
        mlp_hidden_size=None,
        activation_type="swiglu",
        block_type="sequential",
        block_group_size=1,
        alibi=False,
        alibi_bias_max=8.0,
        rope=False,
        rope_full_precision=True,
        multi_query_attention=False,
        attention_layer_norm=False,
        layer_norm_type="default",
        layer_norm_with_affine=True,
        attention_layer_norm_with_affine=True,
        max_sequence_length=1024,
        include_bias=True,
        bias_for_layer_norm=None,
        scale_logits=False,
        vocab_size=50257,
        embedding_size=50304,
        weight_tying=True,
        eos_token_id=50256,
        pad_token_id=50256,
        **kwargs,
    ):
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.mlp_ratio = mlp_ratio
        self.mlp_hidden_size = mlp_hidden_size
        self.activation_type = activation_type
        self.block_type = block_type
        self.block_group_size = block_group_size
        self.alibi = alibi
        self.alibi_bias_max = alibi_bias_max
        self.rope = rope
        self.rope_full_precision = rope_full_precision
        self.multi_query_attention = multi_query_attention
        self.attention_layer_norm = attention_layer_norm
        self.layer_norm_type = layer_norm_type
        self.layer_norm_with_affine = layer_norm_with_affine
        self.attention_layer_norm_with_affine = attention_layer_norm_with_affine
        self.max_sequence_length = max_sequence_length
        self.include_bias = include_bias
        self.bias_for_layer_norm = bias_for_layer_norm
        self.scale_logits = scale_logits
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.weight_tying = weight_tying
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        super().__init__(**kwargs)
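The attribute_map above is standard PretrainedConfig machinery: reads of the generic HuggingFace attribute names are redirected to the OLMo-specific fields, so model-agnostic code keeps working. A minimal sketch (sizes are illustrative):

from vllm.transformers_utils.configs.olmo import OLMoConfig

cfg = OLMoConfig(d_model=4096, n_heads=32, n_layers=32)

# Generic HF names resolve through attribute_map to the OLMo fields:
assert cfg.hidden_size == cfg.d_model == 4096
assert cfg.num_attention_heads == cfg.n_heads == 32
assert cfg.num_hidden_layers == cfg.n_layers == 32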
vllm/worker/model_runner.py

...
@@ -389,6 +389,7 @@ class ModelRunner:
     ) -> SamplingMetadata:
         seq_groups: List[Tuple[List[int], SamplingParams]] = []
         selected_token_indices: List[int] = []
+        generators: List[torch.Generator] = []
         selected_token_start_idx = 0
         categorized_sample_indices = {t: [] for t in SamplingType}
         categorized_sample_indices_start_idx = 0
...
@@ -419,6 +420,10 @@ class ModelRunner:
                 selected_token_indices.append(selected_token_start_idx +
                                               subquery_len - 1)
                 selected_token_start_idx += max_subquery_len
+
+                if sampling_params.seed is not None:
+                    seq_group_metadata.state.generator = torch.Generator(
+                        device="cuda").manual_seed(sampling_params.seed)
             else:
                 num_seqs = len(seq_ids)
                 selected_token_indices.extend(
...
@@ -432,6 +437,9 @@ class ModelRunner:
                         categorized_sample_indices_start_idx + num_seqs))
                 categorized_sample_indices_start_idx += num_seqs
+
+            if sampling_params.seed is not None:
+                generators.append(seq_group_metadata.state.generator)

         selected_token_indices = _async_h2d(selected_token_indices,
                                             dtype=torch.long,
                                             target_device=self.device,
...
@@ -454,6 +462,7 @@ class ModelRunner:
             prompt_lens=prompt_lens,
             selected_token_indices=selected_token_indices,
             categorized_sample_indices=categorized_sample_indices,
+            generators=generators,
         )
         return sampling_metadata
...
@@ -536,6 +545,7 @@ class ModelRunner:
             prompt_lens=None,
             selected_token_indices=metadata_dict["selected_token_indices"],
             categorized_sample_indices=None,
+            generators=None,
             perform_sampling=False,
         )
...
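The model_runner.py changes thread one torch.Generator per seeded request into SamplingMetadata, so a request with sampling_params.seed set draws from its own RNG stream instead of the global one. A standalone sketch of the underlying PyTorch mechanism (not vLLM code; device and values are illustrative):

import torch

probs = torch.tensor([0.1, 0.2, 0.3, 0.4])

# A dedicated generator isolates this request's randomness from the
# global RNG state and makes it replayable from the seed alone.
gen = torch.Generator().manual_seed(1234)
first = torch.multinomial(probs, num_samples=1, generator=gen)

gen.manual_seed(1234)  # reset to the same seed ...
second = torch.multinomial(probs, num_samples=1, generator=gen)
assert first.item() == second.item()  # ... and the draw repeats exactly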
vllm/worker/worker.py

...
@@ -93,8 +93,6 @@ class Worker:
         # Initialize the distributed environment.
         init_distributed_environment(self.parallel_config, self.rank,
                                      cupy_port, self.distributed_init_method)
-        if not self.parallel_config.disable_custom_all_reduce:
-            init_custom_ar()
         # Initialize the model.
         set_random_seed(self.model_config.seed)
...
@@ -288,6 +286,10 @@ def init_distributed_environment(
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                       parallel_config.pipeline_parallel_size)
+
+    # Initialize a custom fast all-reduce implementation.
+    if not parallel_config.disable_custom_all_reduce:
+        init_custom_ar()


 def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     # Check if the GPU supports the dtype.
...
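The worker.py hunks move custom all-reduce setup out of Worker initialization and into init_distributed_environment(), after the model-parallel groups are created. A hypothetical, condensed illustration of the resulting call order using stub functions (not the actual vLLM implementation):

calls = []

def ensure_model_parallel_initialized(tp_size: int, pp_size: int) -> None:
    calls.append("model_parallel")  # stub for the real group setup

def init_custom_ar() -> None:
    calls.append("custom_all_reduce")  # stub for the real backend init

def init_distributed_environment(disable_custom_all_reduce: bool) -> None:
    ensure_model_parallel_initialized(1, 1)
    # The custom all-reduce backend now initializes here, after the
    # parallel groups exist, so every caller gets it, not only Worker.
    if not disable_custom_all_reduce:
        init_custom_ar()

init_distributed_environment(disable_custom_all_reduce=False)
assert calls == ["model_parallel", "custom_all_reduce"]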