Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0da696a7
"csrc/vscode:/vscode.git/clone" did not exist on "230b131b54e8ad4ee9086a15c69b29b387ddb3b0"
Commit
0da696a7
authored
Jan 20, 2026
by
王敏
Browse files
Merge remote-tracking branch 'origin/v0.11.0-dev' into v0.11.0-dev
parents
82c0bf76
6fa116fb
Changes
31
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
385 additions
and
16 deletions
+385
-16
vllm/config/model.py
vllm/config/model.py
+6
-2
vllm/config/multimodal.py
vllm/config/multimodal.py
+7
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+5
-0
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+20
-0
vllm/entrypoints/renderer.py
vllm/entrypoints/renderer.py
+5
-2
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
..._moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
+165
-0
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
...d_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
+165
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+2
-2
vllm/multimodal/processing.py
vllm/multimodal/processing.py
+10
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+0
-6
vllm/utils/__init__.py
vllm/utils/__init__.py
+0
-4
No files found.
vllm/config/model.py
View file @
0da696a7
...
@@ -210,8 +210,9 @@ class ModelConfig:
...
@@ -210,8 +210,9 @@ class ModelConfig:
output will contain token ids."""
output will contain token ids."""
enable_prompt_embeds
:
bool
=
False
enable_prompt_embeds
:
bool
=
False
"""If `True`, enables passing text embeddings as inputs via the
"""If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key. Note that enabling this will double the time required
`prompt_embeds` key.
for graph compilation."""
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
served_model_name
:
Optional
[
Union
[
str
,
list
[
str
]]]
=
None
served_model_name
:
Optional
[
Union
[
str
,
list
[
str
]]]
=
None
"""The model name(s) used in the API. If multiple names are provided, the
"""The model name(s) used in the API. If multiple names are provided, the
server will respond to any of the provided names. The model name in the
server will respond to any of the provided names. The model name in the
...
@@ -284,6 +285,7 @@ class ModelConfig:
...
@@ -284,6 +285,7 @@ class ModelConfig:
"""Configuration for multimodal model. If `None`, this will be inferred
"""Configuration for multimodal model. If `None`, this will be inferred
from the architecture of `self.model`."""
from the architecture of `self.model`."""
limit_mm_per_prompt
:
InitVar
[
Optional
[
dict
[
str
,
int
]]]
=
None
limit_mm_per_prompt
:
InitVar
[
Optional
[
dict
[
str
,
int
]]]
=
None
enable_mm_embeds
:
InitVar
[
bool
|
None
]
=
None
media_io_kwargs
:
InitVar
[
Optional
[
dict
[
str
,
dict
[
str
,
Any
]]]]
=
None
media_io_kwargs
:
InitVar
[
Optional
[
dict
[
str
,
dict
[
str
,
Any
]]]]
=
None
mm_processor_kwargs
:
InitVar
[
Optional
[
dict
[
str
,
Any
]]]
=
None
mm_processor_kwargs
:
InitVar
[
Optional
[
dict
[
str
,
Any
]]]
=
None
mm_processor_cache_gb
:
InitVar
[
Optional
[
float
]]
=
None
mm_processor_cache_gb
:
InitVar
[
Optional
[
float
]]
=
None
...
@@ -353,6 +355,7 @@ class ModelConfig:
...
@@ -353,6 +355,7 @@ class ModelConfig:
self
,
self
,
# Multimodal config init vars
# Multimodal config init vars
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]],
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]],
enable_mm_embeds
:
bool
|
None
,
media_io_kwargs
:
Optional
[
dict
[
str
,
dict
[
str
,
Any
]]],
media_io_kwargs
:
Optional
[
dict
[
str
,
dict
[
str
,
Any
]]],
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]],
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]],
mm_processor_cache_gb
:
Optional
[
float
],
mm_processor_cache_gb
:
Optional
[
float
],
...
@@ -618,6 +621,7 @@ class ModelConfig:
...
@@ -618,6 +621,7 @@ class ModelConfig:
mm_config_kwargs
=
dict
(
mm_config_kwargs
=
dict
(
limit_per_prompt
=
limit_mm_per_prompt
,
limit_per_prompt
=
limit_mm_per_prompt
,
enable_mm_embeds
=
enable_mm_embeds
,
media_io_kwargs
=
media_io_kwargs
,
media_io_kwargs
=
media_io_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_cache_gb
=
mm_processor_cache_gb
,
mm_processor_cache_gb
=
mm_processor_cache_gb
,
...
...
vllm/config/multimodal.py
View file @
0da696a7
...
@@ -26,6 +26,13 @@ class MultiModalConfig:
...
@@ -26,6 +26,13 @@ class MultiModalConfig:
For example, to allow up to 16 images and 2 videos per prompt:
For example, to allow up to 16 images and 2 videos per prompt:
`{"image": 16, "video": 2}`"""
`{"image": 16, "video": 2}`"""
enable_mm_embeds
:
bool
=
False
"""If `True`, enables passing multimodal embeddings:
for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
for the OpenAI-compatible server, this refers to chat messages with content
`"type": "*_embeds"`.
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
"""Additional args passed to process media inputs, keyed by modalities.
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
For example, to set num_frames for video, set
...
...
vllm/engine/arg_utils.py
View file @
0da696a7
...
@@ -379,6 +379,7 @@ class EngineArgs:
...
@@ -379,6 +379,7 @@ class EngineArgs:
disable_custom_all_reduce
:
bool
=
ParallelConfig
.
disable_custom_all_reduce
disable_custom_all_reduce
:
bool
=
ParallelConfig
.
disable_custom_all_reduce
limit_mm_per_prompt
:
dict
[
str
,
int
]
=
\
limit_mm_per_prompt
:
dict
[
str
,
int
]
=
\
get_field
(
MultiModalConfig
,
"limit_per_prompt"
)
get_field
(
MultiModalConfig
,
"limit_per_prompt"
)
enable_mm_embeds
:
bool
=
MultiModalConfig
.
enable_mm_embeds
interleave_mm_strings
:
bool
=
MultiModalConfig
.
interleave_mm_strings
interleave_mm_strings
:
bool
=
MultiModalConfig
.
interleave_mm_strings
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
get_field
(
MultiModalConfig
,
Any
]]
=
get_field
(
MultiModalConfig
,
...
@@ -796,6 +797,9 @@ class EngineArgs:
...
@@ -796,6 +797,9 @@ class EngineArgs:
)
)
multimodal_group
.
add_argument
(
"--limit-mm-per-prompt"
,
multimodal_group
.
add_argument
(
"--limit-mm-per-prompt"
,
**
multimodal_kwargs
[
"limit_per_prompt"
])
**
multimodal_kwargs
[
"limit_per_prompt"
])
multimodal_group
.
add_argument
(
"--enable-mm-embeds"
,
**
multimodal_kwargs
[
"enable_mm_embeds"
]
)
multimodal_group
.
add_argument
(
"--media-io-kwargs"
,
multimodal_group
.
add_argument
(
"--media-io-kwargs"
,
**
multimodal_kwargs
[
"media_io_kwargs"
])
**
multimodal_kwargs
[
"media_io_kwargs"
])
multimodal_group
.
add_argument
(
multimodal_group
.
add_argument
(
...
@@ -1034,6 +1038,7 @@ class EngineArgs:
...
@@ -1034,6 +1038,7 @@ class EngineArgs:
enable_prompt_embeds
=
self
.
enable_prompt_embeds
,
enable_prompt_embeds
=
self
.
enable_prompt_embeds
,
served_model_name
=
self
.
served_model_name
,
served_model_name
=
self
.
served_model_name
,
limit_mm_per_prompt
=
self
.
limit_mm_per_prompt
,
limit_mm_per_prompt
=
self
.
limit_mm_per_prompt
,
enable_mm_embeds
=
self
.
enable_mm_embeds
,
interleave_mm_strings
=
self
.
interleave_mm_strings
,
interleave_mm_strings
=
self
.
interleave_mm_strings
,
media_io_kwargs
=
self
.
media_io_kwargs
,
media_io_kwargs
=
self
.
media_io_kwargs
,
skip_mm_profiling
=
self
.
skip_mm_profiling
,
skip_mm_profiling
=
self
.
skip_mm_profiling
,
...
...
vllm/entrypoints/chat_utils.py
View file @
0da696a7
...
@@ -845,6 +845,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
...
@@ -845,6 +845,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
allowed_media_domains
=
tracker
.
allowed_media_domains
,
allowed_media_domains
=
tracker
.
allowed_media_domains
,
)
)
@
property
def
model_config
(
self
)
->
ModelConfig
:
return
self
.
_tracker
.
model_config
def
parse_image
(
def
parse_image
(
self
,
image_url
:
Optional
[
str
],
uuid
:
Optional
[
str
]
=
None
self
,
image_url
:
Optional
[
str
],
uuid
:
Optional
[
str
]
=
None
)
->
None
:
)
->
None
:
...
@@ -858,6 +862,12 @@ class MultiModalContentParser(BaseMultiModalContentParser):
...
@@ -858,6 +862,12 @@ class MultiModalContentParser(BaseMultiModalContentParser):
image_embeds
:
Union
[
str
,
dict
[
str
,
str
],
None
],
image_embeds
:
Union
[
str
,
dict
[
str
,
str
],
None
],
uuid
:
Optional
[
str
]
=
None
,
uuid
:
Optional
[
str
]
=
None
,
)
->
None
:
)
->
None
:
mm_config
=
self
.
model_config
.
get_multimodal_config
()
if
not
mm_config
.
enable_mm_embeds
:
raise
ValueError
(
"You must set `--enable-mm-embeds` to input `image_embeds`"
)
if
isinstance
(
image_embeds
,
dict
):
if
isinstance
(
image_embeds
,
dict
):
embeds
=
{
embeds
=
{
k
:
self
.
_connector
.
fetch_image_embedding
(
v
)
k
:
self
.
_connector
.
fetch_image_embedding
(
v
)
...
@@ -930,6 +940,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
...
@@ -930,6 +940,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
allowed_media_domains
=
tracker
.
allowed_media_domains
,
allowed_media_domains
=
tracker
.
allowed_media_domains
,
)
)
@
property
def
model_config
(
self
)
->
ModelConfig
:
return
self
.
_tracker
.
model_config
def
parse_image
(
def
parse_image
(
self
,
image_url
:
Optional
[
str
],
uuid
:
Optional
[
str
]
=
None
self
,
image_url
:
Optional
[
str
],
uuid
:
Optional
[
str
]
=
None
)
->
None
:
)
->
None
:
...
@@ -945,6 +959,12 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
...
@@ -945,6 +959,12 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
image_embeds
:
Union
[
str
,
dict
[
str
,
str
],
None
],
image_embeds
:
Union
[
str
,
dict
[
str
,
str
],
None
],
uuid
:
Optional
[
str
]
=
None
,
uuid
:
Optional
[
str
]
=
None
,
)
->
None
:
)
->
None
:
mm_config
=
self
.
model_config
.
get_multimodal_config
()
if
not
mm_config
.
enable_mm_embeds
:
raise
ValueError
(
"You must set `--enable-mm-embeds` to input `image_embeds`"
)
future
:
asyncio
.
Future
[
Union
[
str
,
dict
[
str
,
str
],
None
]]
=
(
future
:
asyncio
.
Future
[
Union
[
str
,
dict
[
str
,
str
],
None
]]
=
(
asyncio
.
Future
()
asyncio
.
Future
()
)
)
...
...
vllm/entrypoints/renderer.py
View file @
0da696a7
...
@@ -135,14 +135,17 @@ class BaseRenderer(ABC):
...
@@ -135,14 +135,17 @@ class BaseRenderer(ABC):
"""
"""
raise
NotImplementedError
raise
NotImplementedError
@
classmethod
def
load_prompt_embeds
(
def
load_prompt_embeds
(
cls
,
self
,
prompt_embeds
:
Union
[
bytes
,
list
[
bytes
]],
prompt_embeds
:
Union
[
bytes
,
list
[
bytes
]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
0
)]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
0
)]]
=
None
,
cache_salt
:
Optional
[
str
]
=
None
,
cache_salt
:
Optional
[
str
]
=
None
,
)
->
list
[
EngineEmbedsPrompt
]:
)
->
list
[
EngineEmbedsPrompt
]:
"""Load and validate base64-encoded embeddings into prompt objects."""
"""Load and validate base64-encoded embeddings into prompt objects."""
if
not
self
.
model_config
.
enable_prompt_embeds
:
raise
ValueError
(
"You must set `--enable-prompt-embeds` to input `prompt_embeds`."
)
def
_load_and_validate_embed
(
embed
:
bytes
)
->
EngineEmbedsPrompt
:
def
_load_and_validate_embed
(
embed
:
bytes
)
->
EngineEmbedsPrompt
:
tensor
=
torch
.
load
(
tensor
=
torch
.
load
(
...
...
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
0 → 100644
View file @
0da696a7
{
"triton_version"
:
"3.1.0"
,
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
5
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
0 → 100644
View file @
0da696a7
{
"triton_version"
:
"3.1.0"
,
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
0da696a7
...
@@ -1363,14 +1363,14 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,
...
@@ -1363,14 +1363,14 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,
token_expert_indices
:
torch
.
Tensor
,
token_expert_indices
:
torch
.
Tensor
,
gating_output
:
torch
.
Tensor
,
gating_output
:
torch
.
Tensor
,
renormalize
:
bool
)
->
tuple
[
torch
.
Tensor
,
...]:
renormalize
:
bool
)
->
tuple
[
torch
.
Tensor
,
...]:
if
envs
.
VLLM_USE_TOPK_RENORM
:
if
envs
.
VLLM_USE_TOPK_RENORM
and
renormalize
is
True
:
from
lightop
import
op
as
op
from
lightop
import
op
as
op
op
.
topk_softmax
(
op
.
topk_softmax
(
topk_weights
,
topk_weights
,
topk_indices
,
topk_indices
,
token_expert_indices
,
token_expert_indices
,
gating_output
,
gating_output
,
Tru
e
,
renormaliz
e
,
)
)
else
:
else
:
ops
.
topk_softmax
(
ops
.
topk_softmax
(
...
...
vllm/multimodal/processing.py
View file @
0da696a7
...
@@ -1296,6 +1296,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
...
@@ -1296,6 +1296,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
"""
"""
mm_items
=
self
.
data_parser
.
parse_mm_data
(
mm_data
)
mm_items
=
self
.
data_parser
.
parse_mm_data
(
mm_data
)
mm_config
=
self
.
info
.
ctx
.
model_config
.
get_multimodal_config
()
if
not
mm_config
.
enable_mm_embeds
:
for
modality
,
items
in
mm_items
.
items
():
if
isinstance
(
items
,
(
EmbeddingItems
,
DictEmbeddingItems
)):
raise
ValueError
(
f
"You must set `--enable-mm-embeds` to input "
f
"`
{
modality
}
_embeds`"
)
for
modality
,
items
in
mm_items
.
items
():
for
modality
,
items
in
mm_items
.
items
():
self
.
validate_num_items
(
modality
,
len
(
items
))
self
.
validate_num_items
(
modality
,
len
(
items
))
...
...
vllm/platforms/rocm.py
View file @
0da696a7
...
@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless
...
@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
vllm.utils
import
SUPPORT_MOE_MARLIN_W16A16
if
SUPPORT_MOE_MARLIN_W16A16
:
os
.
environ
[
'VLLM_USE_MARLIN_W16A16_MOE'
]
=
'1'
os
.
environ
[
'MOE_NN'
]
=
'0'
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
VllmConfig
...
...
vllm/utils/__init__.py
View file @
0da696a7
...
@@ -86,10 +86,6 @@ if TYPE_CHECKING:
...
@@ -86,10 +86,6 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
GPU_ARCH
=
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
SUPPORT_MOE_MARLIN_W16A16
=
any
(
arch
in
GPU_ARCH
for
arch
in
[
"gfx936"
])
# This value is chosen to have a balance between ITL and TTFT. Note it is
# This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput.
# not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS
=
2048
DEFAULT_MAX_NUM_BATCHED_TOKENS
=
2048
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment