Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6c6f7fe8
Unverified
Commit
6c6f7fe8
authored
Dec 27, 2024
by
Mengqing Cao
Committed by
GitHub
Dec 27, 2024
Browse files
[Platform] Move model arch check to platform (#11503)
Signed-off-by:
Mengqing Cao
<
cmq0113@163.com
>
parent
2339d59f
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
51 additions
and
37 deletions
+51
-37
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+1
-36
vllm/platforms/interface.py
vllm/platforms/interface.py
+12
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+38
-1
No files found.
vllm/model_executor/models/registry.py
View file @
6c6f7fe8
...
@@ -187,31 +187,6 @@ _VLLM_MODELS = {
...
@@ -187,31 +187,6 @@ _VLLM_MODELS = {
**
_SPECULATIVE_DECODING_MODELS
,
**
_SPECULATIVE_DECODING_MODELS
,
}
}
# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS
:
List
[
str
]
=
[]
# Models partially supported by ROCm.
# Architecture -> Reason.
_ROCM_SWA_REASON
=
(
"Sliding window attention (SWA) is not yet supported in "
"Triton flash attention. For half-precision SWA support, "
"please use CK flash attention by setting "
"`VLLM_USE_TRITON_FLASH_ATTN=0`"
)
_ROCM_PARTIALLY_SUPPORTED_MODELS
:
Dict
[
str
,
str
]
=
{
"Qwen2ForCausalLM"
:
_ROCM_SWA_REASON
,
"MistralForCausalLM"
:
_ROCM_SWA_REASON
,
"MixtralForCausalLM"
:
_ROCM_SWA_REASON
,
"PaliGemmaForConditionalGeneration"
:
(
"ROCm flash attention does not yet "
"fully support 32-bit precision on PaliGemma"
),
"Phi3VForCausalLM"
:
(
"ROCm Triton flash attention may run into compilation errors due to "
"excessive use of shared memory. If this happens, disable Triton FA "
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`"
)
}
@
dataclass
(
frozen
=
True
)
@
dataclass
(
frozen
=
True
)
class
_ModelInfo
:
class
_ModelInfo
:
...
@@ -297,17 +272,7 @@ def _try_load_model_cls(
...
@@ -297,17 +272,7 @@ def _try_load_model_cls(
model_arch
:
str
,
model_arch
:
str
,
model
:
_BaseRegisteredModel
,
model
:
_BaseRegisteredModel
,
)
->
Optional
[
Type
[
nn
.
Module
]]:
)
->
Optional
[
Type
[
nn
.
Module
]]:
if
current_platform
.
is_rocm
():
current_platform
.
verify_model_arch
(
model_arch
)
if
model_arch
in
_ROCM_UNSUPPORTED_MODELS
:
raise
ValueError
(
f
"Model architecture '
{
model_arch
}
' is not "
"supported by ROCm for now."
)
if
model_arch
in
_ROCM_PARTIALLY_SUPPORTED_MODELS
:
msg
=
_ROCM_PARTIALLY_SUPPORTED_MODELS
[
model_arch
]
logger
.
warning
(
"Model architecture '%s' is partially "
"supported by ROCm: %s"
,
model_arch
,
msg
)
try
:
try
:
return
model
.
load_model_cls
()
return
model
.
load_model_cls
()
except
Exception
:
except
Exception
:
...
...
vllm/platforms/interface.py
View file @
6c6f7fe8
...
@@ -199,6 +199,18 @@ class Platform:
...
@@ -199,6 +199,18 @@ class Platform:
"""
"""
pass
pass
@
classmethod
def
verify_model_arch
(
cls
,
model_arch
:
str
)
->
None
:
"""
Verify whether the current platform supports the specified model
architecture.
- This will raise an Error or Warning based on the model support on
the current platform.
- By default all models are considered supported.
"""
pass
@
classmethod
@
classmethod
def
verify_quantization
(
cls
,
quant
:
str
)
->
None
:
def
verify_quantization
(
cls
,
quant
:
str
)
->
None
:
"""
"""
...
...
vllm/platforms/rocm.py
View file @
6c6f7fe8
import
os
import
os
from
functools
import
lru_cache
from
functools
import
lru_cache
from
typing
import
TYPE_CHECKING
,
Optional
from
typing
import
TYPE_CHECKING
,
Dict
,
List
,
Optional
import
torch
import
torch
...
@@ -33,6 +33,31 @@ if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
...
@@ -33,6 +33,31 @@ if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
" `spawn` instead."
)
" `spawn` instead."
)
os
.
environ
[
"VLLM_WORKER_MULTIPROC_METHOD"
]
=
"spawn"
os
.
environ
[
"VLLM_WORKER_MULTIPROC_METHOD"
]
=
"spawn"
# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS
:
List
[
str
]
=
[]
# Models partially supported by ROCm.
# Architecture -> Reason.
_ROCM_SWA_REASON
=
(
"Sliding window attention (SWA) is not yet supported in "
"Triton flash attention. For half-precision SWA support, "
"please use CK flash attention by setting "
"`VLLM_USE_TRITON_FLASH_ATTN=0`"
)
_ROCM_PARTIALLY_SUPPORTED_MODELS
:
Dict
[
str
,
str
]
=
{
"Qwen2ForCausalLM"
:
_ROCM_SWA_REASON
,
"MistralForCausalLM"
:
_ROCM_SWA_REASON
,
"MixtralForCausalLM"
:
_ROCM_SWA_REASON
,
"PaliGemmaForConditionalGeneration"
:
(
"ROCm flash attention does not yet "
"fully support 32-bit precision on PaliGemma"
),
"Phi3VForCausalLM"
:
(
"ROCm Triton flash attention may run into compilation errors due to "
"excessive use of shared memory. If this happens, disable Triton FA "
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`"
)
}
class
RocmPlatform
(
Platform
):
class
RocmPlatform
(
Platform
):
_enum
=
PlatformEnum
.
ROCM
_enum
=
PlatformEnum
.
ROCM
...
@@ -102,6 +127,18 @@ class RocmPlatform(Platform):
...
@@ -102,6 +127,18 @@ class RocmPlatform(Platform):
else
:
else
:
parallel_config
.
worker_cls
=
"vllm.worker.worker.Worker"
parallel_config
.
worker_cls
=
"vllm.worker.worker.Worker"
@
classmethod
def
verify_model_arch
(
cls
,
model_arch
:
str
)
->
None
:
if
model_arch
in
_ROCM_UNSUPPORTED_MODELS
:
raise
ValueError
(
f
"Model architecture '
{
model_arch
}
' is not "
"supported by ROCm for now."
)
if
model_arch
in
_ROCM_PARTIALLY_SUPPORTED_MODELS
:
msg
=
_ROCM_PARTIALLY_SUPPORTED_MODELS
[
model_arch
]
logger
.
warning
(
"Model architecture '%s' is partially "
"supported by ROCm: %s"
,
model_arch
,
msg
)
@
classmethod
@
classmethod
def
verify_quantization
(
cls
,
quant
:
str
)
->
None
:
def
verify_quantization
(
cls
,
quant
:
str
)
->
None
:
super
().
verify_quantization
(
quant
)
super
().
verify_quantization
(
quant
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment