Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
343a10fa
Commit
343a10fa
authored
Apr 03, 2025
by
zhuwenwen
Browse files
[update] MLA stage=2 并启动pipelinev2, 修复pipeline subop 操作类型不一致
parent
2c35b6cd
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
14 deletions
+18
-14
examples/mla/triton_decode_attention.py
examples/mla/triton_decode_attention.py
+9
-7
vllm/attention/ops/triton_decode_attention.py
vllm/attention/ops/triton_decode_attention.py
+9
-7
No files found.
examples/mla/triton_decode_attention.py
View file @
343a10fa
...
@@ -37,7 +37,9 @@ import triton.language as tl
...
@@ -37,7 +37,9 @@ import triton.language as tl
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
is_hip_
=
current_platform
.
is_rocm
()
is_hip_
=
current_platform
.
is_rocm
()
os
.
environ
[
"TRITON_HIP_USE_NEW_STREAM_PIPELINE"
]
=
f
"0"
os
.
environ
[
"TRITON_HIP_USE_NEW_STREAM_PIPELINE"
]
=
f
"1"
os
.
environ
[
"TRITON_ENABLE_GLOBAL_TO_LOCAL_AND_NUMSTAGE2"
]
=
"0"
os
.
environ
[
"TRITON_DEFAULT_ENABLE_NUM_VGPRS512"
]
=
"1"
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -751,12 +753,12 @@ def decode_attention_v1(
...
@@ -751,12 +753,12 @@ def decode_attention_v1(
@
triton
.
autotune
(
@
triton
.
autotune
(
configs
=
[
configs
=
[
triton
.
Config
({
"BLOCK_N"
:
16
,
"BLOCK_DIM"
:
64
},
num_warps
=
2
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
16
,
"BLOCK_DIM"
:
64
},
num_warps
=
2
,
num_stages
=
2
),
triton
.
Config
({
"BLOCK_N"
:
16
,
"BLOCK_DIM"
:
64
},
num_warps
=
4
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
16
,
"BLOCK_DIM"
:
64
},
num_warps
=
4
,
num_stages
=
2
),
triton
.
Config
({
"BLOCK_N"
:
32
,
"BLOCK_DIM"
:
64
},
num_warps
=
2
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
32
,
"BLOCK_DIM"
:
64
},
num_warps
=
2
,
num_stages
=
2
),
triton
.
Config
({
"BLOCK_N"
:
32
,
"BLOCK_DIM"
:
64
},
num_warps
=
4
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
32
,
"BLOCK_DIM"
:
64
},
num_warps
=
4
,
num_stages
=
2
),
triton
.
Config
({
"BLOCK_N"
:
64
,
"BLOCK_DIM"
:
32
},
num_warps
=
2
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
64
,
"BLOCK_DIM"
:
32
},
num_warps
=
2
,
num_stages
=
2
),
triton
.
Config
({
"BLOCK_N"
:
64
,
"BLOCK_DIM"
:
32
},
num_warps
=
4
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
64
,
"BLOCK_DIM"
:
32
},
num_warps
=
4
,
num_stages
=
2
),
triton
.
Config
({
"BLOCK_N"
:
128
,
"BLOCK_DIM"
:
32
},
num_warps
=
2
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
128
,
"BLOCK_DIM"
:
32
},
num_warps
=
2
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
128
,
"BLOCK_DIM"
:
32
},
num_warps
=
4
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
128
,
"BLOCK_DIM"
:
32
},
num_warps
=
4
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
256
,
"BLOCK_DIM"
:
32
},
num_warps
=
2
,
num_stages
=
1
),
triton
.
Config
({
"BLOCK_N"
:
256
,
"BLOCK_DIM"
:
32
},
num_warps
=
2
,
num_stages
=
1
),
...
...
vllm/attention/ops/triton_decode_attention.py
View file @
343a10fa
...
@@ -39,7 +39,9 @@ from vllm import envs
...
@@ -39,7 +39,9 @@ from vllm import envs
# from ..backends.triton_config import KERNLE_KINDS
# from ..backends.triton_config import KERNLE_KINDS
is_hip_
=
current_platform
.
is_rocm
()
is_hip_
=
current_platform
.
is_rocm
()
os
.
environ
[
"TRITON_HIP_USE_NEW_STREAM_PIPELINE"
]
=
f
"0"
os
.
environ
[
"TRITON_HIP_USE_NEW_STREAM_PIPELINE"
]
=
f
"1"
os
.
environ
[
"TRITON_ENABLE_GLOBAL_TO_LOCAL_AND_NUMSTAGE2"
]
=
"0"
os
.
environ
[
"TRITON_DEFAULT_ENABLE_NUM_VGPRS512"
]
=
"1"
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
...
@@ -1062,12 +1064,12 @@ def decode_attention_v1(
...
@@ -1062,12 +1064,12 @@ def decode_attention_v1(
# @triton.autotune(
# @triton.autotune(
# configs=[
# configs=[
# triton.Config({"BLOCK_N": 16, "BLOCK_DIM":64}, num_warps=2, num_stages=
1
),
# triton.Config({"BLOCK_N": 16, "BLOCK_DIM":64}, num_warps=2, num_stages=
2
),
# triton.Config({"BLOCK_N": 16, "BLOCK_DIM":64}, num_warps=4, num_stages=
1
),
# triton.Config({"BLOCK_N": 16, "BLOCK_DIM":64}, num_warps=4, num_stages=
2
),
# triton.Config({"BLOCK_N": 32, "BLOCK_DIM":64}, num_warps=2, num_stages=
1
),
# triton.Config({"BLOCK_N": 32, "BLOCK_DIM":64}, num_warps=2, num_stages=
2
),
# triton.Config({"BLOCK_N": 32, "BLOCK_DIM":64}, num_warps=4, num_stages=
1
),
# triton.Config({"BLOCK_N": 32, "BLOCK_DIM":64}, num_warps=4, num_stages=
2
),
# triton.Config({"BLOCK_N": 64, "BLOCK_DIM":32}, num_warps=2, num_stages=
1
),
# triton.Config({"BLOCK_N": 64, "BLOCK_DIM":32}, num_warps=2, num_stages=
2
),
# triton.Config({"BLOCK_N": 64, "BLOCK_DIM":32}, num_warps=4, num_stages=
1
),
# triton.Config({"BLOCK_N": 64, "BLOCK_DIM":32}, num_warps=4, num_stages=
2
),
# triton.Config({"BLOCK_N": 128, "BLOCK_DIM":32}, num_warps=2, num_stages=1),
# triton.Config({"BLOCK_N": 128, "BLOCK_DIM":32}, num_warps=2, num_stages=1),
# triton.Config({"BLOCK_N": 128, "BLOCK_DIM":32}, num_warps=4, num_stages=1),
# triton.Config({"BLOCK_N": 128, "BLOCK_DIM":32}, num_warps=4, num_stages=1),
# triton.Config({"BLOCK_N": 256, "BLOCK_DIM":32}, num_warps=2, num_stages=1),
# triton.Config({"BLOCK_N": 256, "BLOCK_DIM":32}, num_warps=2, num_stages=1),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment