Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
c6c379ab
Unverified
Commit
c6c379ab
authored
Aug 18, 2025
by
Hubert Lu
Committed by
GitHub
Aug 18, 2025
Browse files
[AMD] Reorganize hip-related header files in sgl-kernel (#9320)
parent
c2fbf60f
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
18 additions
and
17 deletions
+18
-17
.github/workflows/pr-test-amd.yml
.github/workflows/pr-test-amd.yml
+1
-0
sgl-kernel/csrc/allreduce/mscclpp_allreduce.cuh
sgl-kernel/csrc/allreduce/mscclpp_allreduce.cuh
+1
-1
sgl-kernel/csrc/elementwise/activation.cu
sgl-kernel/csrc/elementwise/activation.cu
+1
-1
sgl-kernel/csrc/gemm/per_tensor_quant_fp8.cu
sgl-kernel/csrc/gemm/per_tensor_quant_fp8.cu
+2
-2
sgl-kernel/csrc/gemm/per_token_quant_fp8.cu
sgl-kernel/csrc/gemm/per_token_quant_fp8.cu
+2
-2
sgl-kernel/csrc/moe/moe_align_kernel.cu
sgl-kernel/csrc/moe/moe_align_kernel.cu
+0
-2
sgl-kernel/include/hip/hip_act_and_mul.cuh
sgl-kernel/include/hip/hip_act_and_mul.cuh
+0
-0
sgl-kernel/include/hip/hip_math_def.h
sgl-kernel/include/hip/hip_math_def.h
+1
-1
sgl-kernel/include/hip/hip_vec_dtypes.h
sgl-kernel/include/hip/hip_vec_dtypes.h
+0
-0
sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
+0
-0
sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
+0
-0
sgl-kernel/include/hip/impl/hip_vec_half_impl.h
sgl-kernel/include/hip/impl/hip_vec_half_impl.h
+0
-0
sgl-kernel/include/utils.h
sgl-kernel/include/utils.h
+6
-7
sgl-kernel/setup_rocm.py
sgl-kernel/setup_rocm.py
+4
-1
No files found.
.github/workflows/pr-test-amd.yml
View file @
c6c379ab
...
...
@@ -342,6 +342,7 @@ jobs:
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
pr-test-amd-finish
:
if
:
always()
...
...
sgl-kernel/csrc/allreduce/mscclpp_allreduce.cuh
View file @
c6c379ab
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#pragma once
#if
def
ined(__HIP_PLATFORM_AMD__)
#ifdef
USE_ROCM
#include <hip/hip_fp16.h>
#else
#include <cuda_bf16.h>
...
...
sgl-kernel/csrc/elementwise/activation.cu
View file @
c6c379ab
...
...
@@ -25,7 +25,7 @@
#include "utils.h"
#else
#include "hip_act_and_mul.cuh"
#include "hip
/hip
_act_and_mul.cuh"
#endif
// Adapted from flashinfer activation
...
...
sgl-kernel/csrc/gemm/per_tensor_quant_fp8.cu
View file @
c6c379ab
...
...
@@ -69,7 +69,7 @@ __global__ void per_tensor_quant_fp8_kernel(
#pragma unroll
for
(
uint32_t
j
=
0
;
j
<
VEC_SIZE
;
++
j
)
{
float
val
=
fmax
(
fmin
(
static_cast
<
float
>
(
input_vec
[
j
])
*
scale_val
,
FP8_E4M3_MAX
),
-
FP8_E4M3_MAX
);
#if
n
def
USE_ROCM
#if
!
def
ined(
USE_ROCM
) || defined(HIP_FP8_TYPE_E4M3)
output_arr
[
j
]
=
static_cast
<
DST_DTYPE
>
(
val
);
#else
output_arr
[
j
]
=
c10
::
Float8_e4m3fnuz
(
...
...
@@ -83,7 +83,7 @@ __global__ void per_tensor_quant_fp8_kernel(
const
int32_t
remaining_start
=
num_vec_elems
*
VEC_SIZE
;
for
(
int32_t
idx
=
remaining_start
+
gid
;
idx
<
num_elements
;
idx
+=
grid_size
)
{
float
val
=
fmax
(
-
FP8_E4M3_MAX
,
fmin
(
static_cast
<
float
>
(
input
[
idx
])
*
scale_val
,
FP8_E4M3_MAX
));
#if
n
def
USE_ROCM
#if
!
def
ined(
USE_ROCM
) || defined(HIP_FP8_TYPE_E4M3)
output
[
idx
]
=
static_cast
<
DST_DTYPE
>
(
val
);
#else
output
[
idx
]
=
c10
::
Float8_e4m3fnuz
(
...
...
sgl-kernel/csrc/gemm/per_token_quant_fp8.cu
View file @
c6c379ab
...
...
@@ -67,7 +67,7 @@ __global__ void per_token_quant_fp8_kernel(
for
(
uint32_t
j
=
0
;
j
<
kVecSize
;
++
j
)
{
float
val
=
static_cast
<
float
>
(
input_vec
[
j
])
*
scale_inv
;
val
=
fmaxf
(
fminf
(
val
,
FP8_E4M3_MAX
),
-
FP8_E4M3_MAX
);
#if
n
def
USE_ROCM
#if
!
def
ined(
USE_ROCM
) || defined(HIP_FP8_TYPE_E4M3)
output_arr
[
j
]
=
static_cast
<
DST_DTYPE
>
(
val
);
#else
output_arr
[
j
]
=
c10
::
Float8_e4m3fnuz
(
...
...
@@ -143,7 +143,7 @@ __global__ void per_token_quant_fp8_small_batch_kernel(
#pragma unroll
for
(
uint32_t
j
=
0
;
j
<
kVecSize
;
++
j
)
{
float
val
=
fmaxf
(
fminf
(
static_cast
<
float
>
(
input_vec
[
j
])
*
scale_inv
,
FP8_E4M3_MAX
),
-
FP8_E4M3_MAX
);
#if
n
def
USE_ROCM
#if
!
def
ined(
USE_ROCM
) || defined(HIP_FP8_TYPE_E4M3)
output_arr
[
j
]
=
static_cast
<
DST_DTYPE
>
(
val
);
#else
output_arr
[
j
]
=
c10
::
Float8_e4m3fnuz
(
...
...
sgl-kernel/csrc/moe/moe_align_kernel.cu
View file @
c6c379ab
...
...
@@ -21,8 +21,6 @@ limitations under the License.
#include "utils.h"
#define WARP_SIZE 32
#define VEC_SIZE 4
using
Vec
=
int4
;
...
...
sgl-kernel/include/hip_act_and_mul.cuh
→
sgl-kernel/include/hip
/hip
_act_and_mul.cuh
View file @
c6c379ab
File moved
sgl-kernel/include/hip_math_def.h
→
sgl-kernel/include/hip
/hip
_math_def.h
View file @
c6c379ab
...
...
@@ -15,7 +15,7 @@ limitations under the License.
#pragma once
#if
def
ined(__HIP_PLATFORM_AMD__)
#ifdef
USE_ROCM
#include <hip/hip_bf16.h>
#include <hip/hip_common.h>
...
...
sgl-kernel/include/hip_vec_dtypes.h
→
sgl-kernel/include/hip
/hip
_vec_dtypes.h
View file @
c6c379ab
File moved
sgl-kernel/include/impl/hip_vec_bf16_impl.h
→
sgl-kernel/include/
hip/
impl/hip_vec_bf16_impl.h
View file @
c6c379ab
File moved
sgl-kernel/include/impl/hip_vec_fp32_impl.h
→
sgl-kernel/include/
hip/
impl/hip_vec_fp32_impl.h
View file @
c6c379ab
File moved
sgl-kernel/include/impl/hip_vec_half_impl.h
→
sgl-kernel/include/
hip/
impl/hip_vec_half_impl.h
View file @
c6c379ab
File moved
sgl-kernel/include/utils.h
View file @
c6c379ab
...
...
@@ -331,13 +331,15 @@ inline bool getEnvEnablePDL() {
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize // 64
#include <ATen/cuda/CUDAContext.h>
#include <c10/macros/Macros.h>
#define WARP_SIZE C10_WARP_SIZE
#endif
#if
def
ined(__HIP_PLATFORM_AMD__)
#ifdef
USE_ROCM
#include "hip_math_def.h"
#include "hip_vec_dtypes.h"
#include "hip
/hip
_math_def.h"
#include "hip
/hip
_vec_dtypes.h"
#else
...
...
@@ -354,14 +356,11 @@ __device__ __forceinline__ dstDtype castFromFloat(float val) {
#endif
// add FP8 support
#ifndef USE_ROCM
#include <c10/util/Float8_e4m3fn.h>
using
FP8_TYPE
=
c10
::
Float8_e4m3fn
;
C10_HOST_DEVICE
constexpr
auto
FP8_E4M3_MAX
=
std
::
numeric_limits
<
FP8_TYPE
>::
max
();
#else // USE_ROCM
#if HIP_FP8_TYPE_FNUZ
#include <c10/util/Float8_e4m3fnuz.h>
using
FP8_TYPE
=
c10
::
Float8_e4m3fnuz
;
...
...
sgl-kernel/setup_rocm.py
View file @
c6c379ab
...
...
@@ -72,6 +72,9 @@ if amdgpu_target not in ["gfx942", "gfx950"]:
)
sys
.
exit
(
1
)
fp8_macro
=
(
"-DHIP_FP8_TYPE_FNUZ"
if
amdgpu_target
==
"gfx942"
else
"-DHIP_FP8_TYPE_E4M3"
)
hipcc_flags
=
[
"-DNDEBUG"
,
...
...
@@ -80,10 +83,10 @@ hipcc_flags = [
"-Xcompiler"
,
"-fPIC"
,
"-std=c++17"
,
"-D__HIP_PLATFORM_AMD__=1"
,
f
"--amdgpu-target=
{
amdgpu_target
}
"
,
"-DENABLE_BF16"
,
"-DENABLE_FP8"
,
fp8_macro
,
]
ext_modules
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment