Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1a95f10e
Unverified
Commit
1a95f10e
authored
Nov 08, 2024
by
youkaichao
Committed by
GitHub
Nov 09, 2024
Browse files
[5/N] pass the whole config to model (#9983)
Signed-off-by:
youkaichao
<
youkaichao@gmail.com
>
parent
49d2a41a
Changes
75
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
140 additions
and
223 deletions
+140
-223
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+7
-93
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/tensorizer.py
+3
-12
vllm/model_executor/models/arctic.py
vllm/model_executor/models/arctic.py
+9
-7
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+16
-21
vllm/model_executor/models/bart.py
vllm/model_executor/models/bart.py
+6
-6
vllm/model_executor/models/bert.py
vllm/model_executor/models/bert.py
+7
-5
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+11
-9
vllm/model_executor/models/bloom.py
vllm/model_executor/models/bloom.py
+6
-4
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+7
-5
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+8
-7
vllm/model_executor/models/commandr.py
vllm/model_executor/models/commandr.py
+7
-5
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/dbrx.py
+6
-4
vllm/model_executor/models/decilm.py
vllm/model_executor/models/decilm.py
+6
-12
vllm/model_executor/models/deepseek.py
vllm/model_executor/models/deepseek.py
+6
-4
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+6
-4
vllm/model_executor/models/eagle.py
vllm/model_executor/models/eagle.py
+4
-3
vllm/model_executor/models/exaone.py
vllm/model_executor/models/exaone.py
+7
-5
vllm/model_executor/models/falcon.py
vllm/model_executor/models/falcon.py
+6
-4
vllm/model_executor/models/florence2.py
vllm/model_executor/models/florence2.py
+5
-5
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+7
-8
No files found.
vllm/model_executor/model_loader/loader.py
View file @
1a95f10e
...
...
@@ -9,8 +9,7 @@ import math
import
os
from
abc
import
ABC
,
abstractmethod
from
contextlib
import
contextmanager
from
typing
import
(
Any
,
Dict
,
Generator
,
Iterable
,
List
,
Optional
,
Tuple
,
Type
,
cast
)
from
typing
import
Any
,
Dict
,
Generator
,
Iterable
,
List
,
Optional
,
Tuple
,
cast
import
gguf
import
huggingface_hub
...
...
@@ -18,20 +17,17 @@ import numpy as np
import
torch
from
huggingface_hub
import
HfApi
,
hf_hub_download
from
torch
import
nn
from
transformers
import
AutoModelForCausalLM
,
PretrainedConfig
from
transformers
import
AutoModelForCausalLM
from
transformers.utils
import
SAFE_WEIGHTS_INDEX_NAME
from
vllm.config
import
(
CacheConfig
,
LoadConfig
,
LoadFormat
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ParallelConfig
,
PoolerConfig
,
SchedulerConfig
,
VllmConfig
)
from
vllm.config
import
(
LoadConfig
,
LoadFormat
,
ModelConfig
,
ParallelConfig
,
VllmConfig
)
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.envs
import
VLLM_USE_MODELSCOPE
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.linear
import
(
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.model_loader.tensorizer
import
(
TensorizerConfig
,
is_vllm_tensorized
,
load_with_tensorizer
,
serialize_vllm_model
,
tensorizer_weights_iterator
)
...
...
@@ -43,8 +39,6 @@ from vllm.model_executor.model_loader.weight_utils import (
get_gguf_extra_tensor_names
,
gguf_quant_weights_iterator
,
initialize_dummy_weights
,
np_cache_weights_iterator
,
pt_weights_iterator
,
safetensors_weights_iterator
)
from
vllm.model_executor.models
import
(
has_inner_state
,
supports_lora
,
supports_multimodal
)
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.utils
import
is_pin_memory_available
...
...
@@ -94,85 +88,11 @@ def device_loading_context(module: torch.nn.Module,
logger
=
init_logger
(
__name__
)
def
_get_model_initialization_kwargs
(
model_class
:
Type
[
nn
.
Module
],
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
scheduler_config
:
Optional
[
SchedulerConfig
]
=
None
,
pooler_config
:
Optional
[
PoolerConfig
]
=
None
)
->
Dict
[
str
,
Any
]:
"""Get extra kwargs for model initialization."""
extra_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
supports_lora
(
model_class
):
# lora_config=None is used to disable LoRA
extra_kwargs
[
"lora_config"
]
=
lora_config
elif
lora_config
:
raise
ValueError
(
f
"Model
{
model_class
.
__name__
}
does not support LoRA, "
"but LoRA is enabled. Support for this model may "
"be added in the future. If this is important to you, "
"please open an issue on github."
)
if
supports_multimodal
(
model_class
):
assert
multimodal_config
is
not
None
extra_kwargs
[
"multimodal_config"
]
=
multimodal_config
if
has_inner_state
(
model_class
)
and
scheduler_config
:
extra_kwargs
[
"scheduler_config"
]
=
scheduler_config
if
pooler_config
:
extra_kwargs
[
"pooler_config"
]
=
pooler_config
return
extra_kwargs
def
build_model
(
model_class
:
Type
[
nn
.
Module
],
vllm_config
:
Optional
[
VllmConfig
],
hf_config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
],
quant_config
:
Optional
[
QuantizationConfig
],
*
,
lora_config
:
Optional
[
LoRAConfig
],
multimodal_config
:
Optional
[
MultiModalConfig
],
scheduler_config
:
Optional
[
SchedulerConfig
],
prefix
:
Optional
[
str
]
=
None
,
pooler_config
:
Optional
[
PoolerConfig
]
=
None
)
->
nn
.
Module
:
extra_kwargs
=
_get_model_initialization_kwargs
(
model_class
,
lora_config
,
multimodal_config
,
scheduler_config
,
pooler_config
)
if
prefix
:
extra_kwargs
[
"prefix"
]
=
prefix
# TODO: unify all the module initialization code
# to only take the `VllmConfig` object as input
from
vllm.plugins
import
set_vllm_config
set_vllm_config
(
vllm_config
)
return
model_class
(
config
=
hf_config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
**
extra_kwargs
)
def
_initialize_model
(
vllm_config
:
VllmConfig
)
->
nn
.
Module
:
"""Initialize a model with the given configurations."""
model_config
=
vllm_config
.
model_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
cache_config
=
vllm_config
.
cache_config
model_class
,
_
=
get_model_architecture
(
model_config
)
return
build_model
(
model_class
,
vllm_config
,
model_config
.
hf_config
,
cache_config
=
cache_config
,
quant_config
=
vllm_config
.
quant_config
,
lora_config
=
lora_config
,
multimodal_config
=
model_config
.
multimodal_config
,
scheduler_config
=
scheduler_config
,
pooler_config
=
model_config
.
pooler_config
,
)
return
model_class
(
vllm_config
=
vllm_config
)
class
BaseModelLoader
(
ABC
):
...
...
@@ -486,24 +406,18 @@ class TensorizerLoader(BaseModelLoader):
device_config
=
vllm_config
.
device_config
model_config
=
vllm_config
.
model_config
lora_config
=
vllm_config
.
lora_config
cache_config
=
vllm_config
.
cache_config
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
torch
.
device
(
device_config
.
device
):
model_class
=
get_model_architecture
(
model_config
)[
0
]
quant_config
=
vllm_config
.
quant_config
extra_kwargs
=
_get_model_initialization_kwargs
(
model_class
,
lora_config
,
model_config
.
multimodal_config
)
extra_kwargs
[
"quant_config"
]
=
quant_config
extra_kwargs
[
"cache_config"
]
=
cache_config
tensorizer_config
=
copy
.
copy
(
self
.
tensorizer_config
)
tensorizer_config
.
model_class
=
model_class
tensorizer_config
.
hf_config
=
model_config
.
hf_config
tensorizer_config
.
dtype
=
model_config
.
dtype
model
=
load_with_tensorizer
(
tensorizer_config
,
**
extra_kwargs
)
model
=
load_with_tensorizer
(
tensorizer_config
,
vllm_config
=
vllm_config
)
return
model
.
eval
()
def
download_model
(
self
,
model_config
:
ModelConfig
)
->
None
:
...
...
vllm/model_executor/model_loader/tensorizer.py
View file @
1a95f10e
...
...
@@ -17,8 +17,6 @@ from vllm.config import ModelConfig, ParallelConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -268,8 +266,7 @@ class TensorizerAgent:
in vllm/model_executor/model_loader/weight_utils.py
"""
def
__init__
(
self
,
tensorizer_config
:
TensorizerConfig
,
quant_config
:
QuantizationConfig
,
**
extra_kwargs
):
def
__init__
(
self
,
tensorizer_config
:
TensorizerConfig
,
vllm_config
):
if
tensorizer_error_msg
is
not
None
:
raise
ImportError
(
"Tensorizer is not installed. Please install tensorizer "
...
...
@@ -279,11 +276,7 @@ class TensorizerAgent:
self
.
tensorizer_config
=
tensorizer_config
self
.
tensorizer_args
=
(
self
.
tensorizer_config
.
_construct_tensorizer_args
())
self
.
extra_kwargs
=
extra_kwargs
if
extra_kwargs
.
get
(
"quant_config"
)
is
not
None
:
self
.
quant_config
=
extra_kwargs
[
"quant_config"
]
else
:
self
.
quant_config
=
quant_config
self
.
vllm_config
=
vllm_config
self
.
model
=
self
.
_init_model
()
def
_init_model
(
self
):
...
...
@@ -293,9 +286,7 @@ class TensorizerAgent:
assert
self
.
tensorizer_config
.
model_class
is
not
None
with
no_init_or_tensor
():
return
self
.
tensorizer_config
.
model_class
(
config
=
model_args
,
quant_config
=
self
.
quant_config
,
**
self
.
extra_kwargs
)
vllm_config
=
self
.
vllm_config
,
)
def
_resize_lora_embeddings
(
self
):
"""Modify LoRA embedding layers to use bigger tensors
...
...
vllm/model_executor/models/arctic.py
View file @
1a95f10e
...
...
@@ -6,7 +6,7 @@ from torch import nn
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
...
...
@@ -415,14 +415,16 @@ class ArcticModel(nn.Module):
class
ArcticForCausalLM
(
nn
.
Module
,
SupportsPP
):
def
__init__
(
self
,
config
:
ArcticConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
**
kwargs
)
->
None
:
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
model
=
ArcticModel
(
config
,
cache_config
,
quant_config
)
self
.
model
=
ArcticModel
(
config
,
cache_config
,
quant_config
,
prefix
=
prefix
)
self
.
vocab_size
=
config
.
vocab_size
self
.
lm_head
=
ParallelLMHead
(
self
.
vocab_size
,
...
...
vllm/model_executor/models/baichuan.py
View file @
1a95f10e
...
...
@@ -26,7 +26,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRA
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
...
...
@@ -332,14 +332,15 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
position_embedding
:
str
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
position_embedding
:
str
=
"ROPE"
,
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
...
...
@@ -439,17 +440,14 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
config
=
vllm_config
.
model_config
.
hf_config
if
config
.
hidden_size
==
4096
:
# baichuan2 7b
super
().
__init__
(
config
,
"ROPE"
,
cache_config
,
quant_config
,
lora_config
)
super
().
__init__
(
vllm_config
,
prefix
,
"ROPE"
)
else
:
# baichuan 13b, baichuan2 13b
super
().
__init__
(
config
,
"ALIBI"
,
cache_config
,
quant_config
,
lora_config
)
super
().
__init__
(
vllm_config
,
prefix
,
"ALIBI"
)
class
BaiChuanForCausalLM
(
BaiChuanBaseForCausalLM
):
...
...
@@ -459,10 +457,7 @@ class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
(
config
,
"ROPE"
,
cache_config
,
quant_config
,
lora_config
)
super
().
__init__
(
vllm_config
,
prefix
,
"ROPE"
)
vllm/model_executor/models/bart.py
View file @
1a95f10e
...
...
@@ -25,7 +25,7 @@ from transformers import BartConfig
from
transformers.utils
import
logging
from
vllm.attention
import
Attention
,
AttentionMetadata
,
AttentionType
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -810,13 +810,13 @@ class BartModel(nn.Module):
class
BartForConditionalGeneration
(
nn
.
Module
):
base_model_prefix
=
"model"
def
__init__
(
self
,
config
:
BartConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
# currently all existing BART models have `tie_word_embeddings` enabled
assert
config
.
tie_word_embeddings
self
.
config
=
config
...
...
vllm/model_executor/models/bert.py
View file @
1a95f10e
...
...
@@ -6,7 +6,7 @@ from transformers import BertConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
,
AttentionType
from
vllm.attention.backends.xformers
import
XFormersImpl
from
vllm.config
import
CacheConfig
,
Pooler
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -384,12 +384,14 @@ class BertEmbeddingModel(nn.Module):
def
__init__
(
self
,
config
:
BertConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
pooler_config
:
Optional
[
PoolerConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
pooler_config
=
vllm_config
.
model_config
.
pooler_config
self
.
model
=
BertModel
(
config
,
cache_config
,
quant_config
)
self
.
_pooler
=
Pooler
.
from_config_with_defaults
(
pooler_config
,
...
...
vllm/model_executor/models/blip2.py
View file @
1a95f10e
...
...
@@ -8,7 +8,7 @@ from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig,
apply_chunking_to_forward
)
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModal
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
from
vllm.model_executor.layers.activation
import
get_act_fn
...
...
@@ -483,14 +483,17 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs):
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_blip2
)
class
Blip2ForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
def
__init__
(
self
,
config
:
Blip2Config
,
multimodal_config
:
MultiModal
Config
,
cache_con
fi
g
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
)
->
None
:
def
__init__
(
self
,
vllm_config
:
Vllm
Config
,
pre
fi
x
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
...
...
@@ -513,8 +516,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
self
.
language_model
=
init_vllm_registered_model
(
config
.
text_config
,
cache_config
,
quant_config
,
vllm_config
=
vllm_config
,
prefix
=
"language_model"
)
self
.
make_empty_intermediate_tensors
=
(
...
...
vllm/model_executor/models/bloom.py
View file @
1a95f10e
...
...
@@ -24,7 +24,7 @@ from transformers import BloomConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.activation
import
get_act_fn
...
...
@@ -283,11 +283,13 @@ class BloomForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
BloomConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
transformer
=
BloomModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/chameleon.py
View file @
1a95f10e
...
...
@@ -9,7 +9,7 @@ from torch import nn
from
transformers
import
ChameleonConfig
,
ChameleonVQVAEConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModal
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
...
...
@@ -926,12 +926,14 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
def
__init__
(
self
,
config
:
ChameleonConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
self
.
model
=
ChameleonModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/chatglm.py
View file @
1a95f10e
...
...
@@ -11,7 +11,7 @@ from torch import nn
from
torch.nn
import
LayerNorm
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
MultiModal
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
...
...
@@ -595,14 +595,15 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
def
__init__
(
self
,
config
:
ChatGLMConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
lora_config
=
lora_config
self
.
multimodal_config
=
multimodal_config
...
...
vllm/model_executor/models/commandr.py
View file @
1a95f10e
...
...
@@ -28,7 +28,7 @@ from transformers import CohereConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
...
...
@@ -334,12 +334,14 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
CohereConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
# currently all existing command R models have `tie_word_embeddings`
# enabled
...
...
vllm/model_executor/models/dbrx.py
View file @
1a95f10e
...
...
@@ -4,7 +4,7 @@ import torch
import
torch.nn
as
nn
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
...
...
@@ -352,11 +352,13 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
DbrxConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
if
config
.
tie_word_embeddings
:
raise
ValueError
(
...
...
vllm/model_executor/models/decilm.py
View file @
1a95f10e
...
...
@@ -22,13 +22,11 @@
# limitations under the License.
"""Inference-only DeciLM model compatible with HuggingFace weights."""
from
typing
import
Iterable
,
Optional
,
Tuple
from
typing
import
Iterable
,
Tuple
import
torch
from
transformers
import
LlamaConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
...
...
@@ -55,17 +53,13 @@ class DeciLMForCausalLM(LlamaForCausalLM):
def
__init__
(
self
,
config
:
LlamaConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
config
=
vllm_config
.
model_config
.
hf_config
config
.
num_key_value_heads
=
max
(
config
.
num_key_value_heads_per_layer
)
delattr
(
config
,
"num_key_value_heads_per_layer"
)
super
().
__init__
(
config
=
config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
lora_config
=
lora_config
)
super
().
__init__
(
vllm_config
=
vllm_config
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
stacked_params_mapping
=
[
...
...
vllm/model_executor/models/deepseek.py
View file @
1a95f10e
...
...
@@ -27,7 +27,7 @@ from torch import nn
from
transformers
import
PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
...
...
@@ -385,11 +385,13 @@ class DeepseekForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
model
=
DeepseekModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
1a95f10e
...
...
@@ -28,7 +28,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
...
...
@@ -481,11 +481,13 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
model
=
DeepseekV2Model
(
config
,
...
...
vllm/model_executor/models/eagle.py
View file @
1a95f10e
...
...
@@ -4,6 +4,7 @@ import torch
import
torch.nn
as
nn
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
...
...
@@ -12,7 +13,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.eagle
import
EAGLEConfig
class
EAGLE
(
nn
.
Module
):
...
...
@@ -34,14 +34,15 @@ class EAGLE(nn.Module):
in the draft checkpoint (using key token_map). Also, the draft config
needs to have truncated_vocab_size (=k) as an attribute."""
def
__init__
(
self
,
config
:
EAGLE
Config
,
*
args
,
**
kwargs
)
->
None
:
def
__init__
(
self
,
vllm_
config
:
Vllm
Config
,
prefix
:
str
=
""
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
self
.
config
=
config
architectures
=
getattr
(
self
.
config
.
model
,
"architectures"
,
[])
model_cls
,
_
=
ModelRegistry
.
resolve_model_cls
(
architectures
)
self
.
model
=
model_cls
(
self
.
config
.
model
,
*
args
,
**
kwargs
)
self
.
model
=
model_cls
(
vllm_
config
,
prefix
)
self
.
fc
=
nn
.
Linear
(
config
.
model
.
hidden_size
*
2
,
config
.
model
.
hidden_size
,
bias
=
getattr
(
self
.
config
,
"eagle_fc_bias"
,
False
))
...
...
vllm/model_executor/models/exaone.py
View file @
1a95f10e
...
...
@@ -29,7 +29,7 @@ from torch import nn
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
...
...
@@ -440,12 +440,14 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
ExaoneConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
...
...
vllm/model_executor/models/falcon.py
View file @
1a95f10e
...
...
@@ -27,7 +27,7 @@ from transformers import FalconConfig as HF_FalconConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
...
...
@@ -403,11 +403,13 @@ class FalconForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
FalconConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
transformer
=
FalconModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/florence2.py
View file @
1a95f10e
...
...
@@ -6,7 +6,7 @@ import torch.nn as nn
from
transformers
import
PretrainedConfig
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
...
...
@@ -189,11 +189,11 @@ class Florence2LanguageForConditionalGeneration(nn.Module):
class
Florence2ForConditionalGeneration
(
nn
.
Module
):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
# TODO(Isotr0py): Add vision backbone
self
.
language_model
=
Florence2LanguageForConditionalGeneration
(
...
...
vllm/model_executor/models/fuyu.py
View file @
1a95f10e
...
...
@@ -22,14 +22,13 @@ import torch
import
torch.nn
as
nn
import
torch.utils.checkpoint
from
PIL
import
Image
from
transformers
import
FuyuConfig
,
FuyuImageProcessor
from
transformers
import
FuyuImageProcessor
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModal
Config
from
vllm.config
import
Vllm
Config
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.models.persimmon
import
PersimmonForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
...
...
@@ -227,12 +226,12 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_fuyu
)
class
FuyuForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
def
__init__
(
self
,
config
:
FuyuConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
)
->
None
:
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment