Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
FlashMLA
Commits
1dea361d
Commit
1dea361d
authored
Apr 07, 2026
by
zhanghj2
Browse files
Fix data hazard on buffer_load-to-LDS instructions
parent
8a69b46c
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
0 deletions
+22
-0
csrc/extension/flash_fwd_mla_kernel_fp8.h
csrc/extension/flash_fwd_mla_kernel_fp8.h
+1
-0
csrc/extension/utils.h
csrc/extension/utils.h
+13
-0
csrc/gfx93/decode/sparse_fp8/splitkv_mla.cuh
csrc/gfx93/decode/sparse_fp8/splitkv_mla.cuh
+2
-0
csrc/utils.h
csrc/utils.h
+6
-0
No files found.
csrc/extension/flash_fwd_mla_kernel_fp8.h
View file @
1dea361d
...
...
@@ -710,6 +710,7 @@ __forceinline__ __device__ void compute_attn_1rowblock_splitkv_mla_fp8_gfx938(co
+
offset_k
*
3
*
bytes_per_block
;
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
csrc/extension/utils.h
View file @
1dea361d
...
...
@@ -713,6 +713,7 @@ lds_direct_copy_qkvfp8_pe(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -779,6 +780,7 @@ lds_direct_copy_qkvfp8(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -845,6 +847,7 @@ lds_direct_copy_qkvfp8(
#if defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -932,6 +935,7 @@ lds_direct_copy_fp8(
#if defined(__gfx936__) || defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -1003,6 +1007,7 @@ lds_direct_copy_tp1(
// }
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -1183,6 +1188,7 @@ lds_direct_copy_sparse_k(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -1251,6 +1257,7 @@ lds_direct_copy(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -1314,6 +1321,7 @@ lds_direct_copy(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -1403,6 +1411,7 @@ lds_direct_copy_for_prefill_sparse_mla(
index_offset
[
1
]
=
offset_v
;
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 , idxen offen offset:0, lds
\n
"
::
"v"
(
index_offset
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -2296,6 +2305,7 @@ lds_direct_copy_qkvfp8_q_tp1(
#if defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -2354,6 +2364,7 @@ lds_direct_copy_qkvfp8_q_tp4(
#if defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -2435,6 +2446,7 @@ lds_direct_copy_qkvfp8_tp1(
#if defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -2782,6 +2794,7 @@ lds_direct_copy_qkvfp8_zero_lds(
#if defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
csrc/gfx93/decode/sparse_fp8/splitkv_mla.cuh
View file @
1dea361d
...
...
@@ -120,6 +120,7 @@ __device__ void KernelTemplate<MODEL_TYPE, NUM_HEADS>::compute_attn_1rowblock_sp
+
offset_k
*
3
*
bytes_per_block
;
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -137,6 +138,7 @@ __device__ void KernelTemplate<MODEL_TYPE, NUM_HEADS>::compute_attn_1rowblock_sp
+
offset_k
*
2
*
bytes_per_block
;
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
csrc/utils.h
View file @
1dea361d
...
...
@@ -458,6 +458,7 @@ lds_direct_copy(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -521,6 +522,7 @@ lds_direct_copy(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -591,6 +593,7 @@ lds_direct_copy_for_prefill_sparse_mla(
index_offset
[
1
]
=
offset_v
;
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 , idxen offen offset:0, lds
\n
"
::
"v"
(
index_offset
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -818,6 +821,7 @@ lds_direct_copy_qkvfp8(
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -884,6 +888,7 @@ lds_direct_copy_qkvfp8(
#if defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
@@ -1041,6 +1046,7 @@ lds_direct_copy_fp8(
#if defined(__gfx936__) || defined(__gfx938__)
asm
volatile
(
"s_mov_b32 m0, %1
\n\t
"
"s_nop 0
\n\t
"
"buffer_load_dwordx4 %0, %2, %3 ,offen offset:0, lds
\n
"
::
"v"
(
offset_v
),
"s"
(
ldsAddrPerWave
),
"s"
(
global_addr
),
"s"
(
offset_s
)
:
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment