Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1a95f10e
Unverified
Commit
1a95f10e
authored
Nov 08, 2024
by
youkaichao
Committed by
GitHub
Nov 09, 2024
Browse files
[5/N] pass the whole config to model (#9983)
Signed-off-by:
youkaichao
<
youkaichao@gmail.com
>
parent
49d2a41a
Changes
75
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
168 additions
and
144 deletions
+168
-144
vllm/model_executor/models/mamba.py
vllm/model_executor/models/mamba.py
+8
-6
vllm/model_executor/models/medusa.py
vllm/model_executor/models/medusa.py
+3
-2
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm.py
+7
-5
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+19
-29
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral.py
+7
-6
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mixtral_quant.py
+6
-4
vllm/model_executor/models/mllama.py
vllm/model_executor/models/mllama.py
+9
-6
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+8
-8
vllm/model_executor/models/mpt.py
vllm/model_executor/models/mpt.py
+7
-5
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron.py
+7
-6
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+9
-5
vllm/model_executor/models/olmoe.py
vllm/model_executor/models/olmoe.py
+6
-4
vllm/model_executor/models/opt.py
vllm/model_executor/models/opt.py
+7
-5
vllm/model_executor/models/orion.py
vllm/model_executor/models/orion.py
+6
-4
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+16
-14
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/persimmon.py
+9
-5
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi.py
+8
-7
vllm/model_executor/models/phi3_small.py
vllm/model_executor/models/phi3_small.py
+7
-6
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+12
-11
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+7
-6
No files found.
vllm/model_executor/models/mamba.py
View file @
1a95f10e
...
...
@@ -6,7 +6,7 @@ from torch import nn
from
transformers
import
MambaConfig
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
Scheduler
Config
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
Vllm
Config
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
...
...
@@ -132,12 +132,14 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
def
__init__
(
self
,
config
:
MambaConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
scheduler_config
:
Optional
[
SchedulerConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
scheduler_config
=
vllm_config
.
scheduler_config
assert
not
cache_config
.
enable_prefix_caching
,
\
"Mamba does not support prefix caching"
...
...
vllm/model_executor/models/medusa.py
View file @
1a95f10e
...
...
@@ -3,13 +3,13 @@ from typing import Iterable, List, Optional, Tuple
import
torch
import
torch.nn
as
nn
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.transformers_utils.configs.medusa
import
MedusaConfig
class
ResidualBlock
(
nn
.
Module
):
...
...
@@ -44,7 +44,8 @@ class Medusa(nn.Module):
in the draft checkpoint (using key token_map). Also, the draft config
needs to have truncated_vocab_size (=k) as an attribute."""
def
__init__
(
self
,
config
:
MedusaConfig
,
**
_
)
->
None
:
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
)
->
None
:
config
=
vllm_config
.
model_config
.
hf_config
super
().
__init__
()
self
.
config
=
config
self
.
blocks
=
nn
.
ModuleList
([
...
...
vllm/model_executor/models/minicpm.py
View file @
1a95f10e
...
...
@@ -29,7 +29,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
...
...
@@ -463,12 +463,14 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
...
...
vllm/model_executor/models/minicpmv.py
View file @
1a95f10e
...
...
@@ -34,7 +34,7 @@ from transformers import PretrainedConfig
from
typing_extensions
import
NotRequired
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
MultiModal
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
...
...
@@ -385,11 +385,13 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
config
=
vllm_config
.
model_config
.
hf_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
super
().
__init__
()
# All MiniCPM-V models disable `tie_word_embeddings` but
# `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
...
...
@@ -701,12 +703,10 @@ class MiniCPMV2_0(MiniCPMVBaseModel):
def
__init__
(
self
,
config
:
PretrainedConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
(
config
,
multimodal_config
,
cache_config
,
quant
_config
)
super
().
__init__
(
vllm
_config
)
assert
self
.
version
==
(
2
,
0
)
def
init_llm
(
...
...
@@ -867,13 +867,10 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
def
__init__
(
self
,
config
:
PretrainedConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
(
config
,
multimodal_config
,
cache_config
,
quant
_config
)
super
().
__init__
(
vllm
_config
)
assert
self
.
version
==
(
2
,
5
)
def
init_llm
(
...
...
@@ -1017,12 +1014,10 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
def
__init__
(
self
,
config
:
PretrainedConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
super
().
__init__
(
config
,
multimodal_config
,
cache_config
,
quant
_config
)
super
().
__init__
(
vllm
_config
)
assert
self
.
version
==
(
2
,
6
)
def
init_llm
(
...
...
@@ -1141,12 +1136,8 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA):
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__new__
(
cls
,
config
:
PretrainedConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
):
def
__new__
(
cls
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
if
not
hasattr
(
config
,
"version"
):
if
config
.
hidden_size
==
2304
and
config
.
query_num
==
64
:
version
=
(
2
,
0
)
...
...
@@ -1160,5 +1151,4 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA):
if
instance_class
is
None
:
raise
ValueError
(
"Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6"
)
return
instance_class
(
config
,
multimodal_config
,
cache_config
,
quant_config
)
return
instance_class
(
vllm_config
,
prefix
=
prefix
)
vllm/model_executor/models/mixtral.py
View file @
1a95f10e
...
...
@@ -28,7 +28,7 @@ from transformers import MixtralConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
...
...
@@ -334,13 +334,14 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
MixtralConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
...
...
vllm/model_executor/models/mixtral_quant.py
View file @
1a95f10e
...
...
@@ -29,7 +29,7 @@ from torch import nn
from
transformers
import
MixtralConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
...
...
@@ -352,11 +352,13 @@ class MixtralForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
MixtralConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
model
=
MixtralModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/mllama.py
View file @
1a95f10e
...
...
@@ -33,7 +33,7 @@ from transformers.models.mllama.processing_mllama import (
import
vllm.distributed.parallel_state
as
ps
from
vllm.attention
import
Attention
,
AttentionMetadata
,
AttentionType
from
vllm.attention.ops.paged_attn
import
PagedAttention
from
vllm.config
import
CacheConfig
,
MultiModal
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DummyData
,
EncoderDecoderInputs
,
InputContext
,
TokenInputs
,
token_inputs
)
...
...
@@ -1108,12 +1108,15 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
"up_proj"
:
(
"gate_up_proj"
,
1
),
}
def
__init__
(
self
,
config
:
config_mllama
.
MllamaConfig
,
multimodal_config
:
MultiModal
Config
,
cache_con
fi
g
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
)
:
def
__init__
(
self
,
vllm_config
:
Vllm
Config
,
pre
fi
x
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
vocab_size
=
config
.
text_config
.
vocab_size
self
.
hidden_size
=
config
.
text_config
.
hidden_size
self
.
max_num_tiles
=
config
.
vision_config
.
max_num_tiles
...
...
vllm/model_executor/models/molmo.py
View file @
1a95f10e
...
...
@@ -3,8 +3,7 @@ import re
from
array
import
array
from
dataclasses
import
dataclass
from
functools
import
lru_cache
,
partial
from
typing
import
(
Any
,
Iterable
,
List
,
Mapping
,
Optional
,
Tuple
,
TypedDict
,
Union
)
from
typing
import
Iterable
,
List
,
Mapping
,
Optional
,
Tuple
,
TypedDict
,
Union
import
torch
from
einops
import
rearrange
...
...
@@ -16,7 +15,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.attention.selector
import
_Backend
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
MultiModal
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
split_tensor_along_last_dim
,
...
...
@@ -1027,13 +1026,14 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
multimodal_config
:
Optional
[
MultiModalConfig
]
=
None
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
Mapping
[
str
,
Any
]]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
...
...
vllm/model_executor/models/mpt.py
View file @
1a95f10e
...
...
@@ -7,7 +7,7 @@ import torch.nn as nn
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.activation
import
get_act_fn
...
...
@@ -269,11 +269,13 @@ class MPTForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
MPTConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
):
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
assert
config
.
tie_word_embeddings
self
.
quant_config
=
quant_config
...
...
vllm/model_executor/models/nemotron.py
View file @
1a95f10e
...
...
@@ -27,7 +27,7 @@ from torch import nn
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -403,13 +403,14 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
NemotronConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
assert
isinstance
(
config
,
NemotronConfig
)
self
.
config
=
config
...
...
vllm/model_executor/models/olmo.py
View file @
1a95f10e
...
...
@@ -28,7 +28,7 @@ from transformers import OlmoConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
...
...
@@ -291,11 +291,15 @@ class OlmoForCausalLM(nn.Module, SupportsPP):
Extremely barebones HF model wrapper.
"""
def
__init__
(
self
,
config
:
OlmoConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
model
=
OlmoModel
(
config
,
cache_config
,
quant_config
)
if
config
.
tie_word_embeddings
:
...
...
vllm/model_executor/models/olmoe.py
View file @
1a95f10e
...
...
@@ -18,7 +18,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
...
...
@@ -311,11 +311,13 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
model
=
OlmoeModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/opt.py
View file @
1a95f10e
...
...
@@ -24,7 +24,7 @@ from transformers import OPTConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -344,11 +344,13 @@ class OPTForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
OPTConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
):
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
super
().
__init__
()
self
.
config
=
config
self
.
quant_config
=
quant_config
...
...
vllm/model_executor/models/orion.py
View file @
1a95f10e
...
...
@@ -11,7 +11,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
...
...
@@ -270,11 +270,13 @@ class OrionForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
model
=
OrionModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/paligemma.py
View file @
1a95f10e
...
...
@@ -6,13 +6,11 @@ from torch import nn
from
transformers
import
PaliGemmaConfig
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModal
Config
from
vllm.config
import
Vllm
Config
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.models.gemma
import
GemmaForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.utils
import
cached_get_tokenizer
...
...
@@ -21,7 +19,8 @@ from vllm.sequence import IntermediateTensors
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.siglip
import
(
SiglipVisionModel
,
dummy_image_for_siglip
,
dummy_seq_data_for_siglip
,
get_max_siglip_image_tokens
)
from
.utils
import
AutoWeightsLoader
,
merge_multimodal_embeddings
from
.utils
import
(
AutoWeightsLoader
,
init_vllm_registered_model
,
merge_multimodal_embeddings
)
logger
=
init_logger
(
__name__
)
...
...
@@ -132,13 +131,15 @@ class PaliGemmaMultiModalProjector(nn.Module):
class
PaliGemmaForConditionalGeneration
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
def
__init__
(
self
,
config
:
PaliGemmaConfig
,
multimodal_config
:
MultiModal
Config
,
cache_con
fi
g
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
)
->
None
:
def
__init__
(
self
,
vllm_config
:
Vllm
Config
,
pre
fi
x
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
...
...
@@ -150,10 +151,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
projection_dim
=
config
.
vision_config
.
projection_dim
)
self
.
quant_config
=
quant_config
self
.
language_model
=
GemmaForCausalLM
(
config
.
text_config
,
cache_config
,
quant_config
,
prefix
=
"language_model"
)
config
.
text_config
.
architectures
=
[
"GemmaForCausalLM"
]
self
.
language_model
=
init_vllm_registered_model
(
config
.
text_config
,
vllm_config
=
vllm_config
,
prefix
=
"language_model"
)
logit_scale
=
getattr
(
config
,
"logit_scale"
,
1.0
)
self
.
language_model
.
logits_processor
.
scale
*=
logit_scale
...
...
vllm/model_executor/models/persimmon.py
View file @
1a95f10e
...
...
@@ -27,7 +27,7 @@ from transformers import PersimmonConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -265,11 +265,15 @@ class PersimmonModel(nn.Module):
class
PersimmonForCausalLM
(
nn
.
Module
,
SupportsPP
):
def
__init__
(
self
,
config
:
PersimmonConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
vocab_size
=
config
.
vocab_size
self
.
model
=
PersimmonModel
(
config
,
...
...
vllm/model_executor/models/phi.py
View file @
1a95f10e
...
...
@@ -42,7 +42,7 @@ from transformers import PhiConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRA
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
...
...
@@ -279,13 +279,14 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
PhiConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
):
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
# lm_head use bias, cannot share word embeddings
assert
not
config
.
tie_word_embeddings
...
...
vllm/model_executor/models/phi3_small.py
View file @
1a95f10e
...
...
@@ -6,7 +6,7 @@ from torch import nn
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRA
Config
from
vllm.config
import
CacheConfig
,
Vllm
Config
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
...
...
@@ -365,12 +365,13 @@ class Phi3SmallForCausalLM(nn.Module, SupportsPP):
def
__init__
(
self
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
):
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
model
=
Phi3SmallModel
(
config
,
cache_config
,
quant_config
)
...
...
vllm/model_executor/models/phi3v.py
View file @
1a95f10e
...
...
@@ -25,8 +25,7 @@ from PIL import Image
from
transformers
import
CLIPVisionConfig
,
PretrainedConfig
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
(
CacheConfig
,
ModelConfig
,
MultiModalConfig
,
PoolerConfig
)
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
from
vllm.logger
import
init_logger
...
...
@@ -526,14 +525,16 @@ def input_processor_for_phi3v(ctx: InputContext,
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_phi3v
)
class
Phi3VForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
def
__init__
(
self
,
config
:
PretrainedConfig
,
multimodal_config
:
MultiModalConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
pooler_config
:
Optional
[
PoolerConfig
]
=
None
)
->
None
:
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
pooler_config
=
vllm_config
.
model_config
.
pooler_config
multimodal_config
=
vllm_config
.
model_config
.
multimodal_config
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
self
.
image_token_id
=
_IMAGE_TOKEN_ID
...
...
@@ -552,8 +553,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
# The prefix is empty intentionally because default prefix of
# LlamaForCausalLM is "model"
self
.
language_model
=
LlamaForCausalLM
(
config
,
cache
_config
,
quant_config
)
self
.
language_model
=
LlamaForCausalLM
(
vllm_
config
=
vllm
_config
,
prefix
=
""
)
# The same model class supports both language generation and embedding
# because the architecture name is the same
...
...
vllm/model_executor/models/phimoe.py
View file @
1a95f10e
...
...
@@ -28,7 +28,7 @@ from transformers.configuration_utils import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
VllmConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.linear
import
(
QKVParallelLinear
,
...
...
@@ -531,13 +531,14 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def
__init__
(
self
,
config
:
PhiMoEConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
)
->
None
:
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
cache_config
=
vllm_config
.
cache_config
quant_config
=
vllm_config
.
quant_config
lora_config
=
vllm_config
.
lora_config
self
.
config
=
config
self
.
lora_config
=
lora_config
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment