Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cffe15ef
Commit
cffe15ef
authored
Dec 13, 2025
by
zhuwenwen
Browse files
Update deps, fix import and Optional errors, remove TC
parent
c004bf6e
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
44 additions
and
83 deletions
+44
-83
requirements/rocm.txt
requirements/rocm.txt
+2
-2
vllm/_custom_ops.py
vllm/_custom_ops.py
+9
-13
vllm/attention/ops/paged_attn.py
vllm/attention/ops/paged_attn.py
+1
-2
vllm/config/cache.py
vllm/config/cache.py
+1
-0
vllm/config/model.py
vllm/config/model.py
+1
-2
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/vocab_parallel_embedding.py
+26
-54
vllm/model_executor/parameter.py
vllm/model_executor/parameter.py
+4
-4
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+0
-6
No files found.
requirements/rocm.txt
View file @
cffe15ef
...
...
@@ -25,9 +25,9 @@ quart
fastrlock==0.8.3
cupy==12.3.0
torch == 2.5.1
torch == 2.7.1
triton == 3.1
flash_attn == 2.6.1
flash_mla == 1.0.0
lightop == 0.6.0
lmslim == 0.3.1
# lmslim == 0.3.1
vllm/_custom_ops.py
View file @
cffe15ef
...
...
@@ -10,7 +10,7 @@ from vllm.logger import init_logger
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
ScalarType
from
vllm.utils
import
direct_register_custom_op
from
vllm.utils.torch_utils
import
direct_register_custom_op
try
:
from
lmslim
import
quant_ops
...
...
@@ -472,10 +472,6 @@ def GetAWQShareWorkspaceSize()->int:
def
GetAWQShareWorkspace
()
->
torch
.
Tensor
:
return
quant_ops
.
GetAWQShareWorkspace
()
def
awq_dequantize
(
qweight
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
zeros
:
torch
.
Tensor
,
split_k_iters
:
int
,
thx
:
int
,
thy
:
int
)
->
torch
.
Tensor
:
def
awq_dequantize
(
qweight
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
...
...
@@ -928,7 +924,7 @@ def rocblas_scaled_mm(a: torch.Tensor,
scale_a
:
torch
.
Tensor
,
scale_b
:
torch
.
Tensor
,
out_dtype
:
torch
.
dtype
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
bias
:
torch
.
Tensor
|
None
=
None
)
->
torch
.
Tensor
:
# cutlass_compatible_b = b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
# if current_platform.is_rocm() or not cutlass_compatible_b:
...
...
@@ -947,7 +943,7 @@ def blaslt_scaled_mm(a: torch.Tensor,
scale_a
:
torch
.
Tensor
,
scale_b
:
torch
.
Tensor
,
out_dtype
:
torch
.
dtype
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
bias
:
torch
.
Tensor
|
None
=
None
)
->
torch
.
Tensor
:
m
=
a
.
shape
[
0
]
n
=
b
.
shape
[
0
]
k
=
a
.
shape
[
1
]
...
...
@@ -961,8 +957,8 @@ def triton_scaled_mm(a: torch.Tensor,
scale_a
:
torch
.
Tensor
,
scale_b
:
torch
.
Tensor
,
out_dtype
:
torch
.
dtype
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
best_config
:
Optional
[
list
]
=
None
)
->
torch
.
Tensor
:
bias
:
torch
.
Tensor
|
None
=
None
,
best_config
:
list
|
None
=
None
)
->
torch
.
Tensor
:
return
quant_ops
.
triton_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
,
best_config
)
...
...
@@ -974,8 +970,8 @@ def triton_int8_gemm_helper(m: int,
use_bias
:
bool
,
out_dtype
:
type
[
torch
.
dtype
]
=
torch
.
float16
,
device
:
str
=
"cuda:0"
,
best_config
:
Optional
[
list
]
=
None
,
repeat
:
Optional
[
int
]
=
2
):
best_config
:
list
|
None
=
None
,
repeat
:
int
|
None
=
2
):
return
quant_tools
.
triton_int8_gemm_helper
(
m
,
n
,
k
,
per_token_act_quant
,
per_out_channel_weight_quant
,
use_bias
,
out_dtype
,
device
,
best_config
,
repeat
)
def
triton_blockint8_gemm_helper
(
m
:
int
,
...
...
@@ -985,8 +981,8 @@ def triton_blockint8_gemm_helper(m: int,
use_bias
:
bool
=
False
,
out_dtype
:
type
[
torch
.
dtype
]
=
torch
.
bfloat16
,
device
:
str
=
"cuda:0"
,
best_config
:
Optional
[
dict
]
=
None
,
repeat
:
Optional
[
int
]
=
2
):
best_config
:
dict
|
None
=
None
,
repeat
:
int
|
None
=
2
):
return
quant_tools
.
triton_blockint8_gemm_helper
(
m
,
n
,
k
,
block_size
,
use_bias
,
out_dtype
,
device
,
best_config
,
repeat
)
...
...
vllm/attention/ops/paged_attn.py
View file @
cffe15ef
...
...
@@ -7,7 +7,6 @@ import torch
from
vllm.platforms
import
current_platform
from
vllm
import
envs
from
vllm.utils
import
SUPPORT_TC
from
vllm.triton_utils
import
HAS_TRITON
if
current_platform
.
is_cuda_alike
():
...
...
@@ -18,7 +17,7 @@ elif current_platform.is_xpu():
if
HAS_TRITON
:
from
vllm.attention.ops.prefix_prefill
import
context_attention_fwd
use_tc
=
envs
.
VLLM_USE_OPT_OP
and
envs
.
VLLM_USE_TC_PAGED_ATTN
and
SUPPORT_TC
use_tc
=
envs
.
VLLM_USE_OPT_OP
and
envs
.
VLLM_USE_TC_PAGED_ATTN
class
PagedAttention
:
...
...
vllm/config/cache.py
View file @
cffe15ef
...
...
@@ -11,6 +11,7 @@ from vllm.config.utils import config
from
vllm.logger
import
init_logger
from
vllm.utils.mem_constants
import
GiB_bytes
from
vllm.utils.mem_utils
import
get_cpu_memory
from
vllm
import
envs
if
TYPE_CHECKING
:
from
vllm.config.parallel
import
ParallelConfig
...
...
vllm/config/model.py
View file @
cffe15ef
...
...
@@ -49,7 +49,6 @@ from vllm.transformers_utils.utils import (
)
from
vllm.utils.import_utils
import
LazyLoader
from
vllm.utils.torch_utils
import
common_broadcastable_dtype
from
vllm.utils
import
SUPPORT_TC
if
TYPE_CHECKING
:
from
transformers
import
PretrainedConfig
...
...
@@ -1674,7 +1673,7 @@ class ModelConfig:
@
property
def
use_mla
(
self
)
->
bool
:
return
self
.
is_deepseek_mla
and
not
envs
.
VLLM_MLA_DISABLE
and
SUPPORT_TC
return
self
.
is_deepseek_mla
and
not
envs
.
VLLM_MLA_DISABLE
@
property
def
is_matryoshka
(
self
)
->
bool
:
...
...
vllm/model_executor/layers/vocab_parallel_embedding.py
View file @
cffe15ef
...
...
@@ -29,7 +29,6 @@ from vllm.model_executor.parameter import BasevLLMParameter
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.utils
import
SUPPORT_TC
DEFAULT_VOCAB_PADDING_SIZE
=
64
...
...
@@ -185,59 +184,32 @@ class VocabParallelEmbeddingShardIndices:
assert
self
.
num_added_elements
<=
self
.
num_added_elements_padded
if
SUPPORT_TC
:
@
torch
.
compile
(
dynamic
=
True
,
backend
=
current_platform
.
simple_compile_backend
)
def
get_masked_input_and_mask
(
input_
:
torch
.
Tensor
,
org_vocab_start_index
:
int
,
org_vocab_end_index
:
int
,
num_org_vocab_padding
:
int
,
added_vocab_start_index
:
int
,
added_vocab_end_index
:
int
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# torch.compile will fuse all of the pointwise ops below
# into a single kernel, making it very fast
org_vocab_mask
=
(
input_
>=
org_vocab_start_index
)
&
(
input_
<
org_vocab_end_index
)
added_vocab_mask
=
(
input_
>=
added_vocab_start_index
)
&
(
input_
<
added_vocab_end_index
)
added_offset
=
(
added_vocab_start_index
-
(
org_vocab_end_index
-
org_vocab_start_index
)
-
num_org_vocab_padding
)
valid_offset
=
(
org_vocab_start_index
*
org_vocab_mask
)
+
(
added_offset
*
added_vocab_mask
)
vocab_mask
=
org_vocab_mask
|
added_vocab_mask
input_
=
vocab_mask
*
(
input_
-
valid_offset
)
return
input_
,
~
vocab_mask
else
:
def
get_masked_input_and_mask
(
input_
:
torch
.
Tensor
,
org_vocab_start_index
:
int
,
org_vocab_end_index
:
int
,
num_org_vocab_padding
:
int
,
added_vocab_start_index
:
int
,
added_vocab_end_index
:
int
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# torch.compile will fuse all of the pointwise ops below
# into a single kernel, making it very fast
org_vocab_mask
=
(
input_
>=
org_vocab_start_index
)
&
(
input_
<
org_vocab_end_index
)
added_vocab_mask
=
(
input_
>=
added_vocab_start_index
)
&
(
input_
<
added_vocab_end_index
)
added_offset
=
(
added_vocab_start_index
-
(
org_vocab_end_index
-
org_vocab_start_index
)
-
num_org_vocab_padding
)
valid_offset
=
(
org_vocab_start_index
*
org_vocab_mask
)
+
(
added_offset
*
added_vocab_mask
)
vocab_mask
=
org_vocab_mask
|
added_vocab_mask
input_
=
vocab_mask
*
(
input_
-
valid_offset
)
return
input_
,
~
vocab_mask
@
torch
.
compile
(
dynamic
=
True
,
backend
=
current_platform
.
simple_compile_backend
)
def
get_masked_input_and_mask
(
input_
:
torch
.
Tensor
,
org_vocab_start_index
:
int
,
org_vocab_end_index
:
int
,
num_org_vocab_padding
:
int
,
added_vocab_start_index
:
int
,
added_vocab_end_index
:
int
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# torch.compile will fuse all of the pointwise ops below
# into a single kernel, making it very fast
org_vocab_mask
=
(
input_
>=
org_vocab_start_index
)
&
(
input_
<
org_vocab_end_index
)
added_vocab_mask
=
(
input_
>=
added_vocab_start_index
)
&
(
input_
<
added_vocab_end_index
)
added_offset
=
(
added_vocab_start_index
-
(
org_vocab_end_index
-
org_vocab_start_index
)
-
num_org_vocab_padding
)
valid_offset
=
(
org_vocab_start_index
*
org_vocab_mask
)
+
(
added_offset
*
added_vocab_mask
)
vocab_mask
=
org_vocab_mask
|
added_vocab_mask
input_
=
vocab_mask
*
(
input_
-
valid_offset
)
return
input_
,
~
vocab_mask
@
CustomOp
.
register
(
"vocab_parallel_embedding"
)
class
VocabParallelEmbedding
(
CustomOp
):
...
...
vllm/model_executor/parameter.py
View file @
cffe15ef
...
...
@@ -98,10 +98,10 @@ class BasevLLMParameter(Parameter):
)
self
.
data
.
copy_
(
loaded_weight
)
def
load_column_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
Optional
[
bool
]
=
False
):
def
load_column_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
bool
|
None
=
False
):
self
.
_assert_and_load
(
loaded_weight
)
def
load_row_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
Optional
[
bool
]
=
False
):
def
load_row_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
bool
|
None
=
False
):
self
.
_assert_and_load
(
loaded_weight
)
def
load_merged_column_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
**
kwargs
):
...
...
@@ -147,7 +147,7 @@ class _ColumnvLLMParameter(BasevLLMParameter):
def
output_dim
(
self
):
return
self
.
_output_dim
def
load_column_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
Optional
[
bool
]
=
False
):
def
load_column_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
bool
|
None
=
False
):
if
not
envs
.
VLLM_USE_NN
or
len
(
self
.
data
.
shape
)
==
1
or
is_quantization
:
shard_size
=
self
.
data
.
shape
[
self
.
output_dim
]
else
:
...
...
@@ -240,7 +240,7 @@ class RowvLLMParameter(BasevLLMParameter):
def
input_dim
(
self
):
return
self
.
_input_dim
def
load_row_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
Optional
[
bool
]
=
False
):
def
load_row_parallel_weight
(
self
,
loaded_weight
:
torch
.
Tensor
,
is_quantization
:
bool
|
None
=
False
):
if
not
envs
.
VLLM_USE_NN
or
is_quantization
:
shard_size
=
self
.
data
.
shape
[
self
.
input_dim
]
else
:
...
...
vllm/platforms/rocm.py
View file @
cffe15ef
...
...
@@ -14,12 +14,6 @@ from vllm.utils.torch_utils import cuda_device_count_stateless
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
from
vllm.utils
import
SUPPORT_TC
if
not
SUPPORT_TC
:
os
.
environ
[
'VLLM_USE_V1'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_MLA'
]
=
'0'
if
TYPE_CHECKING
:
from
vllm.config
import
VllmConfig
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment