Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
86c3b5a8
Unverified
Commit
86c3b5a8
authored
Feb 26, 2026
by
Roberto L. Castro
Committed by
GitHub
Feb 25, 2026
Browse files
[BugFix] Fix fp4 quant kernel on CUDA 12.8 (#35210)
Signed-off-by:
LopezCastroRoberto
<
rocastro@redhat.com
>
parent
160424a9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
11 additions
and
7 deletions
+11
-7
csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
...quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+4
-2
csrc/quantization/fp4/nvfp4_quant_kernels.cu
csrc/quantization/fp4/nvfp4_quant_kernels.cu
+7
-5
No files found.
csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
View file @
86c3b5a8
...
...
@@ -107,7 +107,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
(
uint64_t
(
out_val
.
hi
)
<<
32
)
|
uint64_t
(
out_val
.
lo
);
reinterpret_cast
<
uint64_t
*>
(
out
)[
outOffset
>>
1
]
=
packed64
;
}
else
{
out
[
inOffset
]
=
out_val
;
int64_t
outOffset
=
rowIdx
*
(
numCols
/
CVT_FP4_ELTS_PER_THREAD
)
+
colIdx
;
out
[
outOffset
]
=
out_val
;
}
}
}
...
...
@@ -140,7 +142,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
int
const
numBlocksPerSM
=
vllm_runtime_blocks_per_sm
(
static_cast
<
int
>
(
block
.
x
));
int
sf_n_unpadded
=
int
(
n
/
CVT_FP4_
SF_VEC_SIZE
);
int
sf_n_unpadded
=
int
(
n
/
CVT_FP4_
ELTS_PER_THREAD
);
int
grid_y
=
vllm
::
div_round_up
(
sf_n_unpadded
,
static_cast
<
int
>
(
block
.
x
));
int
grid_x
=
std
::
min
(
...
...
csrc/quantization/fp4/nvfp4_quant_kernels.cu
View file @
86c3b5a8
...
...
@@ -109,7 +109,8 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
template
<
class
Type
,
bool
UE8M0_SF
=
false
>
__global__
void
__launch_bounds__
(
512
,
VLLM_BLOCKS_PER_SM
(
512
))
cvt_fp16_to_fp4_sf_major
(
int32_t
numRows
,
int32_t
numCols
,
int32_t
sf_n_unpadded
,
Type
const
*
__restrict__
in
,
int32_t
sf_n_unpadded
,
int32_t
num_packed_cols
,
Type
const
*
__restrict__
in
,
float
const
*
__restrict__
SFScale
,
uint32_t
*
__restrict__
out
,
uint32_t
*
__restrict__
SFout
)
{
...
...
@@ -131,7 +132,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
// Iterate over all rows and cols including padded ones -
// ensures we visit every single scale factor address to initialize it.
for
(
int
rowIdx
=
blockIdx
.
x
;
rowIdx
<
numRows
;
rowIdx
+=
gridDim
.
x
)
{
if
(
colIdx
<
sf_n_unpadded
)
{
if
(
colIdx
<
num_packed_cols
)
{
PackedVec
in_vec
;
int64_t
inOffset
=
rowIdx
*
(
numCols
/
CVT_FP4_ELTS_PER_THREAD
)
+
colIdx
;
...
...
@@ -222,7 +223,8 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
reinterpret_cast
<
uint32_t
*>
(
sf_out
));
});
}
else
{
int
grid_y
=
vllm
::
div_round_up
(
sf_n_unpadded
,
static_cast
<
int
>
(
block
.
x
));
int
num_packed_cols
=
n
/
CVT_FP4_ELTS_PER_THREAD
;
int
grid_y
=
vllm
::
div_round_up
(
num_packed_cols
,
static_cast
<
int
>
(
block
.
x
));
int
grid_x
=
std
::
min
(
m
,
std
::
max
(
1
,
(
multiProcessorCount
*
numBlocksPerSM
)
/
grid_y
));
dim3
grid
(
grid_x
,
grid_y
);
...
...
@@ -232,8 +234,8 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
auto
input_ptr
=
static_cast
<
cuda_type
const
*>
(
input
.
data_ptr
());
// NOTE: We don't support e8m0 scales at this moment.
vllm
::
cvt_fp16_to_fp4_sf_major
<
cuda_type
,
false
>
<<<
grid
,
block
,
0
,
stream
>>>
(
m
,
n
,
sf_n_unpadded
,
input_ptr
,
input_sf_ptr
,
<<<
grid
,
block
,
0
,
stream
>>>
(
m
,
n
,
sf_n_unpadded
,
num_packed_cols
,
input_ptr
,
input_sf_ptr
,
reinterpret_cast
<
uint32_t
*>
(
output_ptr
),
reinterpret_cast
<
uint32_t
*>
(
sf_out
));
});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment