Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0da696a7
Commit
0da696a7
authored
Jan 20, 2026
by
王敏
Browse files
Merge remote-tracking branch 'origin/v0.11.0-dev' into v0.11.0-dev
parents
82c0bf76
6fa116fb
Changes
31
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
385 additions
and
16 deletions
+385
-16
vllm/config/model.py
vllm/config/model.py
+6
-2
vllm/config/multimodal.py
vllm/config/multimodal.py
+7
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+5
-0
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+20
-0
vllm/entrypoints/renderer.py
vllm/entrypoints/renderer.py
+5
-2
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
..._moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
+165
-0
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
...d_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
+165
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+2
-2
vllm/multimodal/processing.py
vllm/multimodal/processing.py
+10
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+0
-6
vllm/utils/__init__.py
vllm/utils/__init__.py
+0
-4
No files found.
vllm/config/model.py
View file @
0da696a7
...
...
@@ -210,8 +210,9 @@ class ModelConfig:
output will contain token ids."""
enable_prompt_embeds
:
bool
=
False
"""If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key. Note that enabling this will double the time required
for graph compilation."""
`prompt_embeds` key.
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
served_model_name
:
Optional
[
Union
[
str
,
list
[
str
]]]
=
None
"""The model name(s) used in the API. If multiple names are provided, the
server will respond to any of the provided names. The model name in the
...
...
@@ -284,6 +285,7 @@ class ModelConfig:
"""Configuration for multimodal model. If `None`, this will be inferred
from the architecture of `self.model`."""
limit_mm_per_prompt
:
InitVar
[
Optional
[
dict
[
str
,
int
]]]
=
None
enable_mm_embeds
:
InitVar
[
bool
|
None
]
=
None
media_io_kwargs
:
InitVar
[
Optional
[
dict
[
str
,
dict
[
str
,
Any
]]]]
=
None
mm_processor_kwargs
:
InitVar
[
Optional
[
dict
[
str
,
Any
]]]
=
None
mm_processor_cache_gb
:
InitVar
[
Optional
[
float
]]
=
None
...
...
@@ -353,6 +355,7 @@ class ModelConfig:
self
,
# Multimodal config init vars
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]],
enable_mm_embeds
:
bool
|
None
,
media_io_kwargs
:
Optional
[
dict
[
str
,
dict
[
str
,
Any
]]],
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]],
mm_processor_cache_gb
:
Optional
[
float
],
...
...
@@ -618,6 +621,7 @@ class ModelConfig:
mm_config_kwargs
=
dict
(
limit_per_prompt
=
limit_mm_per_prompt
,
enable_mm_embeds
=
enable_mm_embeds
,
media_io_kwargs
=
media_io_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_cache_gb
=
mm_processor_cache_gb
,
...
...
vllm/config/multimodal.py
View file @
0da696a7
...
...
@@ -26,6 +26,13 @@ class MultiModalConfig:
For example, to allow up to 16 images and 2 videos per prompt:
`{"image": 16, "video": 2}`"""
enable_mm_embeds
:
bool
=
False
"""If `True`, enables passing multimodal embeddings:
for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
for the OpenAI-compatible server, this refers to chat messages with content
`"type": "*_embeds"`.
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
...
...
vllm/engine/arg_utils.py
View file @
0da696a7
...
...
@@ -379,6 +379,7 @@ class EngineArgs:
disable_custom_all_reduce
:
bool
=
ParallelConfig
.
disable_custom_all_reduce
limit_mm_per_prompt
:
dict
[
str
,
int
]
=
\
get_field
(
MultiModalConfig
,
"limit_per_prompt"
)
enable_mm_embeds
:
bool
=
MultiModalConfig
.
enable_mm_embeds
interleave_mm_strings
:
bool
=
MultiModalConfig
.
interleave_mm_strings
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
get_field
(
MultiModalConfig
,
...
...
@@ -796,6 +797,9 @@ class EngineArgs:
)
multimodal_group
.
add_argument
(
"--limit-mm-per-prompt"
,
**
multimodal_kwargs
[
"limit_per_prompt"
])
multimodal_group
.
add_argument
(
"--enable-mm-embeds"
,
**
multimodal_kwargs
[
"enable_mm_embeds"
]
)
multimodal_group
.
add_argument
(
"--media-io-kwargs"
,
**
multimodal_kwargs
[
"media_io_kwargs"
])
multimodal_group
.
add_argument
(
...
...
@@ -1034,6 +1038,7 @@ class EngineArgs:
enable_prompt_embeds
=
self
.
enable_prompt_embeds
,
served_model_name
=
self
.
served_model_name
,
limit_mm_per_prompt
=
self
.
limit_mm_per_prompt
,
enable_mm_embeds
=
self
.
enable_mm_embeds
,
interleave_mm_strings
=
self
.
interleave_mm_strings
,
media_io_kwargs
=
self
.
media_io_kwargs
,
skip_mm_profiling
=
self
.
skip_mm_profiling
,
...
...
vllm/entrypoints/chat_utils.py
View file @
0da696a7
...
...
@@ -844,6 +844,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
allowed_local_media_path
=
tracker
.
allowed_local_media_path
,
allowed_media_domains
=
tracker
.
allowed_media_domains
,
)
@
property
def
model_config
(
self
)
->
ModelConfig
:
return
self
.
_tracker
.
model_config
def
parse_image
(
self
,
image_url
:
Optional
[
str
],
uuid
:
Optional
[
str
]
=
None
...
...
@@ -858,6 +862,12 @@ class MultiModalContentParser(BaseMultiModalContentParser):
image_embeds
:
Union
[
str
,
dict
[
str
,
str
],
None
],
uuid
:
Optional
[
str
]
=
None
,
)
->
None
:
mm_config
=
self
.
model_config
.
get_multimodal_config
()
if
not
mm_config
.
enable_mm_embeds
:
raise
ValueError
(
"You must set `--enable-mm-embeds` to input `image_embeds`"
)
if
isinstance
(
image_embeds
,
dict
):
embeds
=
{
k
:
self
.
_connector
.
fetch_image_embedding
(
v
)
...
...
@@ -929,6 +939,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
allowed_local_media_path
=
tracker
.
allowed_local_media_path
,
allowed_media_domains
=
tracker
.
allowed_media_domains
,
)
@
property
def
model_config
(
self
)
->
ModelConfig
:
return
self
.
_tracker
.
model_config
def
parse_image
(
self
,
image_url
:
Optional
[
str
],
uuid
:
Optional
[
str
]
=
None
...
...
@@ -945,6 +959,12 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
image_embeds
:
Union
[
str
,
dict
[
str
,
str
],
None
],
uuid
:
Optional
[
str
]
=
None
,
)
->
None
:
mm_config
=
self
.
model_config
.
get_multimodal_config
()
if
not
mm_config
.
enable_mm_embeds
:
raise
ValueError
(
"You must set `--enable-mm-embeds` to input `image_embeds`"
)
future
:
asyncio
.
Future
[
Union
[
str
,
dict
[
str
,
str
],
None
]]
=
(
asyncio
.
Future
()
)
...
...
vllm/entrypoints/renderer.py
View file @
0da696a7
...
...
@@ -135,14 +135,17 @@ class BaseRenderer(ABC):
"""
raise
NotImplementedError
@
classmethod
def
load_prompt_embeds
(
cls
,
self
,
prompt_embeds
:
Union
[
bytes
,
list
[
bytes
]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
0
)]]
=
None
,
cache_salt
:
Optional
[
str
]
=
None
,
)
->
list
[
EngineEmbedsPrompt
]:
"""Load and validate base64-encoded embeddings into prompt objects."""
if
not
self
.
model_config
.
enable_prompt_embeds
:
raise
ValueError
(
"You must set `--enable-prompt-embeds` to input `prompt_embeds`."
)
def
_load_and_validate_embed
(
embed
:
bytes
)
->
EngineEmbedsPrompt
:
tensor
=
torch
.
load
(
...
...
vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=gfx928_120cu_nn.json
0 → 100644
View file @
0da696a7
{
"triton_version"
:
"3.1.0"
,
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
5
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=gfx928_120cu_nn.json
0 → 100644
View file @
0da696a7
{
"triton_version"
:
"3.1.0"
,
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"48"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"96"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
1
}
}
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
0da696a7
...
...
@@ -1363,14 +1363,14 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,
token_expert_indices
:
torch
.
Tensor
,
gating_output
:
torch
.
Tensor
,
renormalize
:
bool
)
->
tuple
[
torch
.
Tensor
,
...]:
if
envs
.
VLLM_USE_TOPK_RENORM
:
if
envs
.
VLLM_USE_TOPK_RENORM
and
renormalize
is
True
:
from
lightop
import
op
as
op
op
.
topk_softmax
(
topk_weights
,
topk_indices
,
token_expert_indices
,
gating_output
,
Tru
e
,
renormaliz
e
,
)
else
:
ops
.
topk_softmax
(
...
...
vllm/multimodal/processing.py
View file @
0da696a7
...
...
@@ -1296,6 +1296,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
"""
mm_items
=
self
.
data_parser
.
parse_mm_data
(
mm_data
)
mm_config
=
self
.
info
.
ctx
.
model_config
.
get_multimodal_config
()
if
not
mm_config
.
enable_mm_embeds
:
for
modality
,
items
in
mm_items
.
items
():
if
isinstance
(
items
,
(
EmbeddingItems
,
DictEmbeddingItems
)):
raise
ValueError
(
f
"You must set `--enable-mm-embeds` to input "
f
"`
{
modality
}
_embeds`"
)
for
modality
,
items
in
mm_items
.
items
():
self
.
validate_num_items
(
modality
,
len
(
items
))
...
...
vllm/platforms/rocm.py
View file @
0da696a7
...
...
@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
vllm.utils
import
SUPPORT_MOE_MARLIN_W16A16
if
SUPPORT_MOE_MARLIN_W16A16
:
os
.
environ
[
'VLLM_USE_MARLIN_W16A16_MOE'
]
=
'1'
os
.
environ
[
'MOE_NN'
]
=
'0'
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
,
VllmConfig
...
...
vllm/utils/__init__.py
View file @
0da696a7
...
...
@@ -86,10 +86,6 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
GPU_ARCH
=
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
SUPPORT_MOE_MARLIN_W16A16
=
any
(
arch
in
GPU_ARCH
for
arch
in
[
"gfx936"
])
# This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS
=
2048
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment