Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
8d5cd8c6
Commit
8d5cd8c6
authored
Oct 15, 2025
by
yuguo
Browse files
[DCU] fix compile issues
parent
6cd2b2dd
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
18 additions
and
5 deletions
+18
-5
transformer_engine/common/CMakeLists.txt
transformer_engine/common/CMakeLists.txt
+0
-2
transformer_engine/common/dropout/dropout.cu
transformer_engine/common/dropout/dropout.cu
+4
-0
transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
+14
-3
No files found.
transformer_engine/common/CMakeLists.txt
View file @
8d5cd8c6
...
@@ -206,9 +206,7 @@ else()
...
@@ -206,9 +206,7 @@ else()
dropout/dropout.cu
dropout/dropout.cu
activation/relu.cu
activation/relu.cu
activation/swiglu.cu
activation/swiglu.cu
gemm/cublaslt_gemm.cu
gemm/hipblas_gemm.cu
gemm/hipblas_gemm.cu
gemm/cutlass_grouped_gemm.cu
normalization/common.cpp
normalization/common.cpp
normalization/layernorm/ln_api.cpp
normalization/layernorm/ln_api.cpp
normalization/layernorm/ln_bwd_semi_cuda_kernel.cu
normalization/layernorm/ln_bwd_semi_cuda_kernel.cu
...
...
transformer_engine/common/dropout/dropout.cu
View file @
8d5cd8c6
...
@@ -43,7 +43,11 @@ __device__ __forceinline__ uint32_t bytewise_less_than(uint32_t a, uint32_t b) {
...
@@ -43,7 +43,11 @@ __device__ __forceinline__ uint32_t bytewise_less_than(uint32_t a, uint32_t b) {
// Bitwise logical op to get answer in MSBs
// Bitwise logical op to get answer in MSBs
// Equivalent logic, applied independently to each bit: result = (a == b) ? ~result : b
// Equivalent logic, applied independently to each bit: result = (a == b) ? ~result : b
#ifdef __HIP_PLATFORM_AMD__
result
=
(
a
==
b
)
?
!
result
:
b
;
#else
asm
(
"lop3.b32 %0, %1, %2, %3, 0x4D;
\n\t
"
:
"=r"
(
result
)
:
"r"
(
a
),
"r"
(
b
),
"r"
(
result
));
asm
(
"lop3.b32 %0, %1, %2, %3, 0x4D;
\n\t
"
:
"=r"
(
result
)
:
"r"
(
a
),
"r"
(
b
),
"r"
(
result
));
#endif
// Mask out everything except MSBs and return
// Mask out everything except MSBs and return
result
&=
0x80808080
;
result
&=
0x80808080
;
...
...
transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
View file @
8d5cd8c6
...
@@ -579,7 +579,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
...
@@ -579,7 +579,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock)
const
size_t
num_rows
,
const
size_t
scale_stride_x
,
const
size_t
scale_stride_y
,
const
size_t
num_rows
,
const
size_t
scale_stride_x
,
const
size_t
scale_stride_y
,
const
size_t
scale_t_stride_x
,
const
size_t
scale_t_stride_y
,
const
float
epsilon
,
const
size_t
scale_t_stride_x
,
const
size_t
scale_t_stride_y
,
const
float
epsilon
,
FP8BlockwiseRowwiseOption
rowwise_option
,
FP8BlockwiseColumnwiseOption
columnwise_option
,
FP8BlockwiseRowwiseOption
rowwise_option
,
FP8BlockwiseColumnwiseOption
columnwise_option
,
const
bool
pow_2_scaling
)
{
const
bool
pow_2_scaling
,
const
float
*
noop_ptr
)
{
if
(
noop_ptr
!=
nullptr
&&
noop_ptr
[
0
]
==
1.0
f
)
{
return
;
}
bool
return_rowwise
=
rowwise_option
!=
FP8BlockwiseRowwiseOption
::
NONE
;
bool
return_rowwise
=
rowwise_option
!=
FP8BlockwiseRowwiseOption
::
NONE
;
bool
return_columnwise_gemm_ready
=
bool
return_columnwise_gemm_ready
=
columnwise_option
==
FP8BlockwiseColumnwiseOption
::
COLUMNWISE_GEMM_READY
;
columnwise_option
==
FP8BlockwiseColumnwiseOption
::
COLUMNWISE_GEMM_READY
;
...
@@ -1000,7 +1003,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock_Rowwise)
...
@@ -1000,7 +1003,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock_Rowwise)
const
size_t
scale_stride_x
,
const
size_t
scale_stride_x
,
const
size_t
scale_stride_y
,
const
float
epsilon
,
const
size_t
scale_stride_y
,
const
float
epsilon
,
FP8BlockwiseRowwiseOption
rowwise_option
,
FP8BlockwiseRowwiseOption
rowwise_option
,
const
bool
pow_2_scaling
)
{
const
bool
pow_2_scaling
,
const
float
*
noop_ptr
)
{
if
(
noop_ptr
!=
nullptr
&&
noop_ptr
[
0
]
==
1.0
f
)
{
return
;
}
bool
return_rowwise
=
rowwise_option
!=
FP8BlockwiseRowwiseOption
::
NONE
;
bool
return_rowwise
=
rowwise_option
!=
FP8BlockwiseRowwiseOption
::
NONE
;
using
SMemVec
=
Vec
<
IType
,
kNVecSMem_Rowwise
>
;
using
SMemVec
=
Vec
<
IType
,
kNVecSMem_Rowwise
>
;
...
@@ -1183,7 +1189,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock_Colwise)
...
@@ -1183,7 +1189,10 @@ __global__ void __launch_bounds__(kThreadsPerBlock_Colwise)
const
IType
*
const
input
,
OType
*
const
output_t
,
CType
*
const
tile_scales_inv_t
,
const
IType
*
const
input
,
OType
*
const
output_t
,
CType
*
const
tile_scales_inv_t
,
const
size_t
row_length
,
const
size_t
num_rows
,
const
size_t
scale_t_stride_x
,
const
size_t
row_length
,
const
size_t
num_rows
,
const
size_t
scale_t_stride_x
,
const
size_t
scale_t_stride_y
,
const
float
epsilon
,
const
size_t
scale_t_stride_y
,
const
float
epsilon
,
FP8BlockwiseColumnwiseOption
columnwise_option
,
const
bool
pow_2_scaling
)
{
FP8BlockwiseColumnwiseOption
columnwise_option
,
const
bool
pow_2_scaling
,
const
float
*
noop_ptr
)
{
if
(
noop_ptr
!=
nullptr
&&
noop_ptr
[
0
]
==
1.0
f
)
{
return
;
}
bool
return_columnwise_gemm_ready
=
bool
return_columnwise_gemm_ready
=
columnwise_option
==
FP8BlockwiseColumnwiseOption
::
COLUMNWISE_GEMM_READY
;
columnwise_option
==
FP8BlockwiseColumnwiseOption
::
COLUMNWISE_GEMM_READY
;
bool
return_columnwise_compact
=
bool
return_columnwise_compact
=
...
@@ -1539,6 +1548,8 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
...
@@ -1539,6 +1548,8 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
const
size_t
block_len
=
blockwise_fp8_block_len
();
const
size_t
block_len
=
blockwise_fp8_block_len
();
const
size_t
num_blocks_x
=
DIVUP
(
row_length
,
(
size_t
)
block_len
);
const
size_t
num_blocks_x
=
DIVUP
(
row_length
,
(
size_t
)
block_len
);
const
size_t
num_blocks_y
=
DIVUP
(
num_rows
,
(
size_t
)
block_len
);
const
size_t
num_blocks_y
=
DIVUP
(
num_rows
,
(
size_t
)
block_len
);
const
float
*
noop_ptr
=
reinterpret_cast
<
const
float
*>
(
noop_tensor
.
dptr
);
#else
#else
const
size_t
num_blocks_x
=
DIVUP
(
row_length
,
(
size_t
)
kTileDim
);
const
size_t
num_blocks_x
=
DIVUP
(
row_length
,
(
size_t
)
kTileDim
);
const
size_t
num_blocks_y
=
DIVUP
(
num_rows
,
(
size_t
)
kTileDim
);
const
size_t
num_blocks_y
=
DIVUP
(
num_rows
,
(
size_t
)
kTileDim
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment