fengzch-das / nunchaku · Commits · 0a7c8614

Commit 0a7c8614, authored Nov 21, 2025 by fengzch-das

Revert "hipify code"

This reverts commit 1a8114bf.

Parent: 1a8114bf
Pipeline #3050 failed with stages in 0 seconds
Changes: 50 · Pipelines: 1
Showing 20 changed files with 130 additions and 141 deletions (+130 −141)
src/kernels/dwconv.cu                               +4   -4
src/kernels/gemm_batched.cu                         +0   -0
src/kernels/gemm_f16.cu                             +0   -0
src/kernels/gemm_w8a8.cu                            +0   -0
src/kernels/layernorm_kernels.cu                    +21  -22
src/kernels/layernorm_kernels.h                     +1   -1
src/kernels/layernorm_kernels_impl.cuh              +1   -2
src/kernels/misc_kernels.cu                         +29  -30
src/kernels/misc_kernels_impl.cuh                   +2   -3
src/kernels/reduction_utils.cuh                     +0   -1
src/kernels/utils.cuh                               +41  -42
src/kernels/zgemm/attention.cu                      +3   -4
src/kernels/zgemm/attention.cuh                     +4   -5
src/kernels/zgemm/epilogues.cuh                     +2   -3
src/kernels/zgemm/gemm_base.cuh                     +10  -11
src/kernels/zgemm/gemm_utils.cuh                    +4   -4
src/kernels/zgemm/gemm_w4a4.cu                      +0   -0
src/kernels/zgemm/gemm_w4a4.cuh                     +8   -9
src/kernels/zgemm/gemm_w4a4_launch_bf16_fp4.cu      +0   -0
src/kernels/zgemm/gemm_w4a4_launch_bf16_int4.cu     +0   -0
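The revert undoes the mechanical HIP translation throughout the tree: hipLaunchKernelGGL macros go back to CUDA triple-chevron launches, hipStream_t and getCurrentHIPStreamMasqueradingAsCUDA() go back to cudaStream_t and getCurrentCUDAStream(), and hipGetLastError() goes back to cudaGetLastError(). As a rough illustration only — the names add_kernel and checkCUDA are taken from the diff below, but the bodies here are a minimal sketch under assumed semantics, not the project's actual code — the two launch styles correspond like this:

#include <cuda_runtime.h>
#include <cstdio>

// Stand-in for the project's checkCUDA() helper (assumption: it reports errors).
#define checkCUDA(call)                                                     \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess)                                            \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
    } while (0)

__global__ void add_kernel(const float *a, const float *b, float *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = a[i] + b[i];
}

void launch_add(const float *a, const float *b, float *out, int n, cudaStream_t stream) {
    int threadsPerBlock = 1024;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    // HIP form removed by this commit:
    //   hipLaunchKernelGGL((add_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
    //                      a, b, out, n);
    //   checkCUDA(hipGetLastError());
    // CUDA form restored by this commit:
    add_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(a, b, out, n);
    checkCUDA(cudaGetLastError());
}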
src/kernels/dwconv.hip → src/kernels/dwconv.cu

@@ -3,7 +3,7 @@
 #include "dispatch_cutlass.h"
-#include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/conv/device/direct_convolution.h"
@@ -74,7 +74,7 @@ static cutlass::Status depthwise_conv2d_kernel_run(cutlass::conv::Conv2dProblemS
                                                    UnderlyingKernel::ElementA *A, UnderlyingKernel::ElementB *B,
                                                    UnderlyingKernel::ElementC *C, UnderlyingKernel::ElementC *D,
                                                    ElementCompute alpha, ElementCompute beta, std::string split_k_mode,
-                                                   hipStream_t stream, int device_id = 0)
+                                                   cudaStream_t stream, int device_id = 0)
 {
     // create the tensor references
     cutlass::Tensor4DCoord tensor_coord_A = cutlass::conv::implicit_gemm_tensor_a_extent(
@@ -183,7 +183,7 @@ Tensor depthwise_conv2d_kernel(Tensor A, Tensor B) {
     Tensor D = Tensor::allocate({N, P, Q, K}, A.dtype(), A.device());
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     cutlass::Status status = depthwise_conv2d_kernel_run(
         &problem_size,
@@ -319,7 +319,7 @@ Tensor dwconv_f16(Tensor input, Tensor weight, Tensor out, Tensor bias) {
     size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
     BufferCUDA workspace(workspace_size);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     cutlass::Status status = implicit_gemm_op.can_implement(arguments);
     if (status != cutlass::Status::kSuccess) {
src/kernels/gemm_batched.hip → src/kernels/gemm_batched.cu (file moved)
src/kernels/gemm_f16.hip → src/kernels/gemm_f16.cu (file moved)
src/kernels/gemm_w8a8.hip → src/kernels/gemm_w8a8.cu (file moved)
src/kernels/layernorm_kernels.hip → src/kernels/layernorm_kernels.cu

-#include "hip/hip_runtime.h"
 #include "layernorm_kernels_impl.cuh"
 #include "dispatch_utils.h"
@@ -11,17 +10,17 @@ void rms_norm(Tensor &out, // [..., hidden_size]
     int num_tokens = input.numel() / hidden_size;
     dim3 grid(num_tokens);
     dim3 block(std::min(hidden_size, 1024));
-    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    const cudaStream_t stream = getCurrentCUDAStream();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
         if (use_quant) {
-            hipLaunchKernelGGL((vllm::rms_norm_kernel<scalar_t, int8_t, true>), dim3(grid), dim3(block), 0, stream, out.data_ptr<int8_t>(),
+            vllm::rms_norm_kernel<scalar_t, int8_t, true><<<grid, block, 0, stream>>>(out.data_ptr<int8_t>(),
                 input.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
         } else {
-            hipLaunchKernelGGL((vllm::rms_norm_kernel<scalar_t, scalar_t, false>), dim3(grid), dim3(block), 0, stream, out.data_ptr<scalar_t>(),
+            vllm::rms_norm_kernel<scalar_t, scalar_t, false><<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),
                 input.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(), epsilon,
@@ -40,10 +39,10 @@ void layernorm_general(Tensor out, Tensor input, Tensor weight, Tensor bias, flo
     size_t size_shmem = input.scalar_size() * hidden_size;
-    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    const cudaStream_t stream = getCurrentCUDAStream();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
         using T = typename packed_as<scalar_t, 2>::type;
-        hipLaunchKernelGGL((vllm::generalLayerNorm<T, half, true>), dim3(grid), dim3(block), size_shmem, stream,
+        vllm::generalLayerNorm<T, half, true><<<grid, block, size_shmem, stream>>>(
            reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
            weight.valid() ? reinterpret_cast<T *>(weight.data_ptr<scalar_t>()) : nullptr,
            bias.valid() ? reinterpret_cast<T *>(bias.data_ptr<scalar_t>()) : nullptr,
@@ -70,13 +69,13 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
     dim3 block(std::min(hidden_size, 1024));
     block.x = 32 * ((block.x + 31) / 32);
-    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    const cudaStream_t stream = getCurrentCUDAStream();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
         using T = scalar_t;
         if (use_per_token_quant) {
             // per-token
-            hipLaunchKernelGGL((vllm::generalLayerNorm<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            vllm::generalLayerNorm<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), nullptr, nullptr,
@@ -93,8 +92,8 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
             // weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
         } else {
             // per-tensor
-            hipLaunchKernelGGL((vllm::generalLayerNorm<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            vllm::generalLayerNorm<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), nullptr, nullptr,
@@ -122,13 +121,13 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
     dim3 block(std::min(hidden_size, 1024));
     block.x = 32 * ((block.x + 31) / 32);
-    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    const cudaStream_t stream = getCurrentCUDAStream();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm_fuse_sum", [&] {
         using T = scalar_t;
         if (use_per_token_quant) {
             // per-token
-            hipLaunchKernelGGL((vllm::generalLayerNorm_fuse_sum<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            vllm::generalLayerNorm_fuse_sum<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), nullptr, nullptr,
@@ -150,8 +149,8 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
             // Not implemented per-tensor input_sum
             assert(false);
-            hipLaunchKernelGGL((vllm::generalLayerNorm_fuse_sum<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            vllm::generalLayerNorm_fuse_sum<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), nullptr, nullptr,
@@ -177,10 +176,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
     int num_tokens = input.numel() / hidden_size;
     dim3 grid(num_tokens);
     dim3 block(std::min(hidden_size, 1024));
-    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    const cudaStream_t stream = getCurrentCUDAStream();
     VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
-        hipLaunchKernelGGL((vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false>), dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
+        vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false><<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(),
            residual.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), gamma.data_ptr<scalar_t>(),
@@ -203,10 +202,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
     dim3 grid(num_tokens);
     dim3 block(std::min(hidden_size, 1024));
-    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    const cudaStream_t stream = getCurrentCUDAStream();
     VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
-        hipLaunchKernelGGL((vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true>), dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
+        vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true><<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(),
            residual.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), gamma.data_ptr<scalar_t>(),
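A detail visible throughout this file: the kernels are templated (for example vllm::rms_norm_kernel<scalar_t, int8_t, true>), so the hipified code had to wrap the kernel name in an extra pair of parentheses — hipLaunchKernelGGL is a preprocessor macro, and the commas inside the template argument list would otherwise be parsed as separate macro arguments. The CUDA chevron syntax restored here needs no such workaround. A minimal sketch of that difference (the kernel name and types below are hypothetical stand-ins, not code from this repository):

#include <cuda_runtime.h>

// Hypothetical templated kernel, standing in for vllm::rms_norm_kernel<...>.
template <typename in_t, typename out_t, bool QUANT>
__global__ void norm_kernel(out_t *out, const in_t *in, float eps, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = static_cast<out_t>(static_cast<float>(in[i]) / (1.0f + eps));
}

void launch(float *out, const float *in, float eps, int n, cudaStream_t stream) {
    dim3 grid((n + 1023) / 1024), block(1024);
    // HIP macro form (reverted): the parentheses around the template keep its
    // commas away from the preprocessor:
    //   hipLaunchKernelGGL((norm_kernel<float, float, false>),
    //                      dim3(grid), dim3(block), 0, stream, out, in, eps, n);
    // CUDA form (restored): the template argument list is ordinary C++ here.
    norm_kernel<float, float, false><<<grid, block, 0, stream>>>(out, in, eps, n);
}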
src/kernels/layernorm_kernels.h

@@ -2,7 +2,7 @@
 #include "common.h"
 #include "Tensor.h"
-#include <hip/hip_fp16.h>
+#include <cuda_fp16.h>
 void rms_norm(Tensor &out,   // [num_tokens, hidden_size]
               Tensor &input, // [num_tokens, hidden_size]
src/kernels/layernorm_kernels_impl.cuh

-#include "hip/hip_runtime.h"
-#include <hip/hip_bf16.h>
+#include <cuda_bf16.h>
 #define ENABLE_BF16 1
src/kernels/misc_kernels.hip → src/kernels/misc_kernels.cu

-#include "hip/hip_runtime.h"
 #include "misc_kernels_impl.cuh"
 #include "misc_kernels.h"
 #include "dispatch_utils.h"
@@ -14,12 +13,12 @@ Tensor add(Tensor a, Tensor b) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (a.numel() + threadsPerBlock - 1) / threadsPerBlock;
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     Tensor out = Tensor::empty_like(a);
     dispatch(out.scalar_type(), [&]<typename scalar_t>() {
-        hipLaunchKernelGGL((add_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
+        add_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
            a.data_ptr<scalar_t>(), b.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(), out.numel());
     });
@@ -47,12 +46,12 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
         if (scale.valid()) {
-            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, false>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
+            mul_add_kernel<scalar_t, unroll, false><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
                scale.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), 0,
@@ -63,7 +62,7 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
                0, 0);
         } else {
-            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, true>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
+            mul_add_kernel<scalar_t, unroll, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
                x.data_ptr<scalar_t>(), nullptr, bias.data_ptr<scalar_t>(), 0, x.numel(), 1, bias.numel(), 0, 0, 0);
         }
     });
@@ -97,12 +96,12 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
     int threadsPerBlock = 1024;
     dim3 grid(ceilDiv(numel, threadsPerBlock * unroll), batch_size);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
         if (scale.valid()) {
-            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, false>), dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
+            mul_add_kernel<scalar_t, unroll, false><<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
                scale.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), (scalar_t)scale_shift,
@@ -113,8 +112,8 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
                batch_scale ? scale.stride(0) : 0, batch_bias ? bias.stride(0) : 0);
         } else {
-            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, true>), dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
+            mul_add_kernel<scalar_t, unroll, true><<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
                nullptr, bias.data_ptr<scalar_t>(), (scalar_t)scale_shift,
@@ -135,12 +134,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
     auto shapeOut = input_id.shape;
     shapeOut.dataExtent.push_back(lookup.shape[-1]);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     Tensor out = Tensor::empty(shapeOut, lookup.scalar_type(), input_id.device());
     dispatch(out.scalar_type(), [&]<typename scalar_t>() {
-        hipLaunchKernelGGL((EmbeddingKernel), dim3(input_id.numel()), dim3(std::min(lookup.shape[-1], 1024)), 0, stream,
+        EmbeddingKernel<<<input_id.numel(), std::min(lookup.shape[-1], 1024), 0, stream>>>(
            input_id.data_ptr<int32_t>(), out.data_ptr<scalar_t>(), lookup.data_ptr<scalar_t>(), lookup.shape[-1]);
     });
@@ -150,12 +149,12 @@ Tensor argmax_sample(Tensor logits) {
     assert(logits.ndims() == 2);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     Tensor out = Tensor::empty({logits.shape[0]}, Tensor::INT32, logits.device());
     dispatch(logits.scalar_type(), [&]<typename scalar_t>() {
-        hipLaunchKernelGGL((argmax_sample_kernel), dim3(logits.shape[0]), dim3(std::min(logits.shape[1], 1024)), 0, stream,
+        argmax_sample_kernel<<<logits.shape[0], std::min(logits.shape[1], 1024), 0, stream>>>(
            logits.data_ptr<scalar_t>(), out.data_ptr<int32_t>(), logits.shape[1]);
     });
@@ -168,7 +167,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
     // assert(qkv.shape[0] == k.shape[0]);
     // assert(qkv.shape[0] == v.shape[0]);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     int dim_q = q.shape[-1] * q.shape[-2];
     int dim_k = k.shape[-1] * k.shape[-2];
@@ -180,7 +179,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
     int num_tokens = qkv.numel() / qkv.shape[-1];
     dispatch(qkv.scalar_type(), [&]<typename scalar_t>() {
-        hipLaunchKernelGGL((splitqkv_kernel), dim3(num_tokens), dim3(std::min(qkv.shape[-1], 1024)), 0, stream, qkv.data_ptr<scalar_t>(),
+        splitqkv_kernel<<<num_tokens, std::min(qkv.shape[-1], 1024), 0, stream>>>(qkv.data_ptr<scalar_t>(),
            q.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(), v.data_ptr<scalar_t>(),
@@ -196,7 +195,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (input.numel() + threadsPerBlock - 1) / threadsPerBlock;
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     auto shapeOut = TensorShape(input.shape.dataExtent);
     shapeOut[-1] /= N;
@@ -211,7 +210,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
         for (int k = 0; k < N; k++) {
            outPtr[k] = out[k].template data_ptr<scalar_t>();
        }
-        hipLaunchKernelGGL((split_mod_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
+        split_mod_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
            input.data_ptr<scalar_t>(), outPtr, input.numel());
     });
@@ -228,10 +227,10 @@ Tensor quant_static(Tensor x, float scale) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
-        hipLaunchKernelGGL((quant_kernel_static<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
+        quant_kernel_static<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
            x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
     });
@@ -248,10 +247,10 @@ Tensor quant_static_fuse_gelu(Tensor x, float scale) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
-        hipLaunchKernelGGL((quant_kernel_static_fuse_gelu<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
+        quant_kernel_static_fuse_gelu<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
            x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
     });
@@ -267,7 +266,7 @@ void cast(Tensor input, Tensor output) {
        assert(input.scalar_size() == output.scalar_size());
     }
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     dispatch(input.scalar_type(), [&]<typename input_t>() {
        dispatch(output.scalar_type(), [&]<typename output_t>() {
@@ -276,10 +275,10 @@ void cast(Tensor input, Tensor output) {
            int threadsPerBlock = 1024;
            int blocksPerGrid = (int)ceilDiv<int64_t>(input.numel(), threadsPerBlock * unroll);
-            hipLaunchKernelGGL((cast_kernel<input_t, output_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
+            cast_kernel<input_t, output_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
                input.data_ptr<input_t>(), output.data_ptr<output_t>(), input.numel());
-            checkCUDA(hipGetLastError());
+            checkCUDA(cudaGetLastError());
        });
     });
 }
@@ -299,7 +298,7 @@ Tensor topk(Tensor x, int k) {
     Tensor out = Tensor::empty(outShape, Tensor::INT32, x.device());
-    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
+    auto stream = getCurrentCUDAStream();
     dispatchVal(k, std::make_integer_sequence<int, MAXK + 1>(), [&]<int K>() {
        if constexpr (K == 0) {
@@ -308,9 +307,9 @@ Tensor topk(Tensor x, int k) {
        }
        if constexpr (K > 0) {
            dispatch(x.scalar_type(), [&]<typename scalar_t>() {
-                hipLaunchKernelGGL((topk_kernel<scalar_t, K>), dim3(ceilDiv(batch, 32)), dim3(32), 0, stream,
+                topk_kernel<scalar_t, K><<<ceilDiv(batch, 32), 32, 0, stream>>>(
                    x.data_ptr<scalar_t>(), out.data_ptr<int>(), N, x.stride(-2), batch);
-                checkCUDA(hipGetLastError());
+                checkCUDA(cudaGetLastError());
            });
        }
     });
src/kernels/misc_kernels_impl.cuh

-#include "hip/hip_runtime.h"
 #include "reduction_utils.cuh"
 #include <array>
-#include <hip/hip_fp16.h>
-#include <hip/hip_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
 #include "utils.cuh"
 #include "activation_kernels_impl.cuh"
src/kernels/reduction_utils.cuh

-#include "hip/hip_runtime.h"
 /*
  * Adapted from
  * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
src/kernels/utils.cuh

-#include "hip/hip_runtime.h"
 // Adated from FasterTransformer,
 // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
 #pragma once
@@ -10,10 +9,10 @@
 #include <cstdio>
-#include <hip/hip_fp16.h>
+#include <cuda_fp16.h>
 #ifdef ENABLE_BF16
-#include <hip/hip_bf16.h>
+#include <cuda_bf16.h>
 #endif
 __device__ __forceinline__ static void trap_unsupported_arch() {
@@ -25,11 +24,11 @@ __device__ __forceinline__ static void trap_unsupported_arch() {
     __trap();
 }
-#if defined(ENABLE_BF16) && defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
-__device__ __forceinline__ static __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c) {
+#if defined(ENABLE_BF16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+__device__ __forceinline__ static __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) {
     trap_unsupported_arch();
-    return __hip_bfloat162(0.0f, 0.0f);
+    return __nv_bfloat162(0.0f, 0.0f);
 }
 #endif
@@ -57,11 +56,11 @@ struct num_elems<half2> {
 };
 #ifdef ENABLE_BF16
 template<>
-struct num_elems<__hip_bfloat16> {
+struct num_elems<__nv_bfloat16> {
     static constexpr int value = 1;
 };
 template<>
-struct num_elems<__hip_bfloat162> {
+struct num_elems<__nv_bfloat162> {
     static constexpr int value = 2;
 };
 #endif
@@ -108,12 +107,12 @@ struct packed_as<float2, 1> {
 };
 #ifdef ENABLE_BF16
 template<>
-struct packed_as<__hip_bfloat16, 2> {
-    using type = __hip_bfloat162;
+struct packed_as<__nv_bfloat16, 2> {
+    using type = __nv_bfloat162;
 };
 template<>
-struct packed_as<__hip_bfloat162, 1> {
-    using type = __hip_bfloat16;
+struct packed_as<__nv_bfloat162, 1> {
+    using type = __nv_bfloat16;
 };
 #endif
 #ifdef ENABLE_FP8
@@ -170,8 +169,8 @@ inline __device__ T ldg(const T *val) {
 #define bf1622float2 __bfloat1622float2
 #define float22bf162 __float22bfloat162_rn
 #define bf162bf162 __bfloat162bfloat162
-inline __device__ int16_t bf1622int16(__hip_bfloat162 val) {
-#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
+inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
     float2 f_val;
     f_val.x = max(min(__low2float(val), 127.f), -128.f);
     f_val.y = max(min(__high2float(val), 127.f), -128.f);
@@ -202,8 +201,8 @@ inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
 #if ENABLE_BF16
 template<>
-inline __device__ __hip_bfloat162 ldg(const __hip_bfloat162 *val) {
-#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
+inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
     return val[0];
 #else
     return __ldg(val);
@@ -211,8 +210,8 @@ inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) {
 }
 template<>
-inline __device__ __hip_bfloat16 ldg(const __hip_bfloat16 *val) {
-#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
+inline __device__ __nv_bfloat16 ldg(const __nv_bfloat16 *val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
     return val[0];
 #else
     return __ldg(val);
@@ -331,81 +330,81 @@ __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) {
 #ifdef ENABLE_BF16
 template<>
-__device__ inline __hip_bfloat16 cuda_cast(int32_t val) {
+__device__ inline __nv_bfloat16 cuda_cast(int32_t val) {
     return static_cast<float>(val);
 }
 template<>
-__device__ inline __hip_bfloat16 cuda_cast(int8_t val) {
+__device__ inline __nv_bfloat16 cuda_cast(int8_t val) {
     return static_cast<float>(val);
 }
 template<>
-__device__ inline int8_t cuda_cast(__hip_bfloat16 val) {
+__device__ inline int8_t cuda_cast(__nv_bfloat16 val) {
     return static_cast<float>(val);
 }
 template<>
-__device__ inline float cuda_cast<float, __hip_bfloat16>(__hip_bfloat16 val) {
+__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
     return __bfloat162float(val);
 }
 template<>
-__device__ inline float2 cuda_cast<float2, __hip_bfloat162>(__hip_bfloat162 val) {
+__device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) {
     return bf1622float2(val);
 }
 template<>
-__device__ inline half cuda_cast<half, __hip_bfloat16>(__hip_bfloat16 val) {
+__device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val) {
     return __float2half(__bfloat162float(val));
 }
 template<>
-__device__ inline int16_t cuda_cast<int16_t, __hip_bfloat162>(__hip_bfloat162 val) {
+__device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) {
     return bf1622int16(val);
 }
 template<>
-__device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, float>(float val) {
+__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) {
     return __float2bfloat16(val);
 }
 template<>
-__device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, half>(half val) {
+__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val) {
     return __float2bfloat16(__half2float(val));
 }
 template<>
-__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, __hip_bfloat16>(__hip_bfloat16 val) {
+__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val) {
     return bf162bf162(val);
 }
 template<>
-__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float>(float val) {
+__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val) {
     return __float2bfloat162_rn(val);
 }
 template<>
-__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float2>(float2 val) {
+__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) {
     return float22bf162(val);
 }
 template<>
-__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, int16_t>(int16_t val) {
+__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val) {
     union {
         int8_t int8[2];
         int16_t int16;
     };
     int16 = val;
-    __hip_bfloat162 res;
-    res.x = cuda_cast<__hip_bfloat16>(int8[0]);
-    res.y = cuda_cast<__hip_bfloat16>(int8[1]);
+    __nv_bfloat162 res;
+    res.x = cuda_cast<__nv_bfloat16>(int8[0]);
+    res.y = cuda_cast<__nv_bfloat16>(int8[1]);
     return res;
 }
 template<>
-__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, half2>(half2 val) {
+__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) {
     return float22bf162(__half22float2(val));
 }
@@ -421,7 +420,7 @@ __device__ __forceinline__ packed_as<half, 2>::type f162f162<half>(half x) {
 #ifdef ENABLE_BF16
 template<>
-__device__ __forceinline__ packed_as<__hip_bfloat16, 2>::type f162f162<__hip_bfloat16>(__hip_bfloat16 x) {
+__device__ __forceinline__ packed_as<__nv_bfloat16, 2>::type f162f162<__nv_bfloat16>(__nv_bfloat16 x) {
     return __bfloat162bfloat162(x);
 }
 #endif
@@ -454,8 +453,8 @@ __device__ inline half cuda_max(half2 val) {
 #ifdef ENABLE_BF16
 template<>
-__device__ inline __hip_bfloat16 cuda_max(__hip_bfloat162 val) {
-#if (defined(__DTK_ARCH__) && (__DTK_ARCH__ >= 800))
+__device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val) {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
     return __hmax(val.x, val.y);
 #else
     assert(false);
@@ -498,14 +497,14 @@ __device__ inline half2 cuda_abs(half2 val) {
 #ifdef ENABLE_BF16
-#if __DTK_ARCH__ >= 800 || !defined(__DTK_ARCH__)
+#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
 template<>
-__device__ inline __hip_bfloat16 cuda_abs(__hip_bfloat16 val) {
+__device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) {
     return __habs(val);
 }
 template<>
-__device__ inline __hip_bfloat162 cuda_abs(__hip_bfloat162 val) {
+__device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) {
     return __habs2(val);
 }
 #endif
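The bulk of this file's changes is a type rename: ROCm spells packed bfloat16 as __hip_bfloat16 / __hip_bfloat162 (from <hip/hip_bf16.h>), while CUDA spells it __nv_bfloat16 / __nv_bfloat162 (from <cuda_bf16.h>), and the surrounding intrinsics (__bfloat162float, __float2bfloat16, __habs2, ...) keep the same names on both sides. A portable tree often hides the difference behind an alias; a minimal sketch of that idea, with hypothetical bf16_t / bf162_t names — this repository instead renames the concrete types in place, which is exactly what the revert switches back:

// Minimal sketch of a bf16 alias layer (assumed portability pattern, not this repo's code).
#if defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_bf16.h>
using bf16_t  = __hip_bfloat16;
using bf162_t = __hip_bfloat162;
#else
#include <cuda_bf16.h>
using bf16_t  = __nv_bfloat16;
using bf162_t = __nv_bfloat162;
#endif

// Intrinsics such as __bfloat162float() share their name across both toolkits,
// so code written against the alias compiles unchanged:
__device__ inline float to_float(bf16_t v) {
    return __bfloat162float(v);
}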
src/kernels/zgemm/attention.hip → src/kernels/zgemm/attention.cu

-#include "hip/hip_runtime.h"
 #include "zgemm.h"
 #include "attention.cuh"
@@ -72,10 +71,10 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
        shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE);
        if (shmem >= 24 * 1024) {
-            checkCUDA(hipFuncSetAttribute(func, hipFuncAttributeMaxDynamicSharedMemorySize, shmem));
+            checkCUDA(cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
        }
-        hipLaunchKernelGGL((func), dim3(grid), dim3(GEMM::WARP_SIZE * GEMM::NUM_WARPS), shmem, getCurrentHIPStreamMasqueradingAsCUDA(), q.data_ptr<packed_q_t>(),
+        func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(q.data_ptr<packed_q_t>(),
            k.data_ptr<packed_k_t>(), v.data_ptr<packed_v_t>(), scale,
@@ -83,7 +82,7 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
            numTokensKV, args, false);
-        checkCUDA(hipGetLastError());
+        checkCUDA(cudaGetLastError());
     };
     launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{
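The hunk above keeps the project's existing logic of raising the kernel's maximum dynamic shared-memory attribute before launching with a large shmem argument; on the CUDA side that opt-in (cudaFuncSetAttribute with cudaFuncAttributeMaxDynamicSharedMemorySize) is what allows a per-block dynamic allocation beyond the default 48 KB limit on recent GPUs. A minimal standalone sketch of the same call — the kernel, sizes, and the omitted error checking here are illustrative, not taken from this file:

#include <cuda_runtime.h>

__global__ void big_shmem_kernel(float *out) {
    extern __shared__ float buf[];   // dynamic shared memory
    buf[threadIdx.x] = static_cast<float>(threadIdx.x);
    __syncthreads();
    out[threadIdx.x] = buf[threadIdx.x];
}

// Illustrative launcher: request a dynamic shared-memory carve-out larger than
// the 48 KB default (supported on sufficiently new architectures).
void launch_big_shmem(float *out, cudaStream_t stream) {
    int shmem = 96 * 1024;
    cudaFuncSetAttribute(big_shmem_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, shmem);
    big_shmem_kernel<<<1, 256, shmem, stream>>>(out);
}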
src/kernels/zgemm/attention.cuh

-#include "hip/hip_runtime.h"
 #pragma once
 #include "gemm_base.cuh"
@@ -27,8 +26,8 @@ struct AttentionFP16Config {
     using half_t = half;
     using half2_t = half2;
-    using epilogue_half_t = typename std::conditional_t<bf16out, __hip_bfloat16, half>;
-    using epilogue_half2_t = typename std::conditional_t<bf16out, __hip_bfloat162, half2>;
+    using epilogue_half_t = typename std::conditional_t<bf16out, __nv_bfloat16, half>;
+    using epilogue_half2_t = typename std::conditional_t<bf16out, __nv_bfloat162, half2>;
 };
 using AttentionFP16Config_FP16 = AttentionFP16Config<false>;
@@ -61,7 +60,7 @@ public:
     using typename AttentionConfig::epilogue_half_t;
     using typename AttentionConfig::epilogue_half2_t;
-#if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     static constexpr bool IS_SM80 = true;
 #else
     static constexpr bool IS_SM80 = false;
@@ -658,7 +657,7 @@ public:
     template<typename Epilogue>
     struct attention_fp16_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
        static constexpr int SHMEM_SIZE = 0; // sizeof(q_shmem_t);
        __device__ void operator()(const packed_q_t *ptr_q,
src/kernels/zgemm/epilogues.cuh

-#include "hip/hip_runtime.h"
 #pragma once
 #include "gemm_base.cuh"
@@ -703,7 +702,7 @@ public:
     // q: [batch_size, #blocks, block_size, #heads, HEAD_DIM]
     // vk: [batch_size, #heads, HEAD_DIM+1, HEAD_DIM]
     struct vk_mul_q_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
        // FIXME FIXME FIXME
        __device__ void operator()(half_t *q, const float *vk, float eps, int num_tokens) {
            const int block_id = blockIdx.x;
@@ -763,7 +762,7 @@ public:
     template<typename Epilogue>
     struct test_epilogue_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
        static constexpr size_t SHMEM_PER_WARP = ceilDiv<size_t>(Base::template load_act_to_fpsum<false>::SHMEM_SIZE, 128) * 128;
        static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
src/kernels/zgemm/gemm_base.cuh

-#include "hip/hip_runtime.h"
 #pragma once
 #include "common.h"
@@ -45,8 +44,8 @@ public:
     // may generate incorrect results in certain circumstances
     static constexpr bool FASTER_I2F = faster_i2f;
-    using half_t = typename std::conditional_t<bf16, __hip_bfloat16, half>;
-    using half2_t = typename std::conditional_t<bf16, __hip_bfloat162, half2>;
+    using half_t = typename std::conditional_t<bf16, __nv_bfloat16, half>;
+    using half2_t = typename std::conditional_t<bf16, __nv_bfloat162, half2>;
 };
 using GEMMConfig_W4A4_FP16 = GEMMConfig_W4A4<false>;
@@ -68,8 +67,8 @@ public:
     using half_t = half;
     using half2_t = half2;
 #else
-    using half_t = __hip_bfloat16;
-    using half2_t = __hip_bfloat162;
+    using half_t = __nv_bfloat16;
+    using half2_t = __nv_bfloat162;
 #endif
 };
@@ -203,9 +202,9 @@ public:
     __device__ __forceinline__ static packed_f32psum_t mma_f16xf16_f32(packed_fpsum_t a, packed_fpsum_t b, packed_f32psum_t psum) {
-        static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __hip_bfloat16>);
-        static constexpr bool is_bf16 = std::is_same_v<half_t, __hip_bfloat16>;
+        static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __nv_bfloat16>);
+        static constexpr bool is_bf16 = std::is_same_v<half_t, __nv_bfloat16>;
        uint4 out1 = mma_m16n8k16_f32f16f16f32<is_bf16>(kernels::bit_cast<uint4>(a),
@@ -891,8 +890,8 @@ constexpr int max_arch() {
 template<typename kernel, typename... T>
 __global__ static void invoke_kernel(T... args) {
-#ifdef __DTK_ARCH__
-    if constexpr (__DTK_ARCH__ >= min_arch<kernel>() && __DTK_ARCH__ <= max_arch<kernel>()) {
+#ifdef __CUDA_ARCH__
+    if constexpr (__CUDA_ARCH__ >= min_arch<kernel>() && __CUDA_ARCH__ <= max_arch<kernel>()) {
        kernel()(args...);
     } else {
        trap_unsupported_arch();
@@ -917,8 +916,8 @@ template<typename T>
 static void test_sizeof() {
     printf("typeid = %s\n", typeid(T).name());
     test_sizeof_host<T>();
-    hipLaunchKernelGGL((test_sizeof_device<T>), dim3(1), dim3(1), 0, 0, );
-    checkCUDA(hipDeviceSynchronize());
+    test_sizeof_device<T><<<1, 1>>>();
+    checkCUDA(cudaDeviceSynchronize());
 }
 }; // namespace nunchaku::kernels
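The invoke_kernel hunk above gates the kernel body on __CUDA_ARCH__, which is defined only during the device-side compilation pass and carries the target architecture (for example 750 or 800), so a fat binary can compile the same template for several architectures and trap on the ones a given kernel does not support. A minimal standalone sketch of that gating idea — the min-arch handling is simplified here and the helpers are stand-ins for the ones in this file:

#include <cuda_runtime.h>

__device__ __forceinline__ static void trap_unsupported_arch() {
    // Simplified stand-in: abort the kernel on an unsupported target.
    __trap();
}

// Only the device pass defines __CUDA_ARCH__, so the branch is resolved per
// compiled architecture at compile time.
template <int MIN_ARCH>
__global__ void arch_gated_kernel(int *out) {
#ifdef __CUDA_ARCH__
    if constexpr (__CUDA_ARCH__ >= MIN_ARCH) {
        out[threadIdx.x] = __CUDA_ARCH__;
    } else {
        trap_unsupported_arch();
    }
#endif
}

void launch_gated(int *out, cudaStream_t stream) {
    arch_gated_kernel<800><<<1, 32, 0, stream>>>(out); // traps on pre-sm_80 devices
}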
src/kernels/zgemm/gemm_utils.cuh

@@ -163,7 +163,7 @@ __device__ __forceinline__ static float2 half22float2(half2 val) {
     return __half22float2(val);
 }
-__device__ __forceinline__ static float2 half22float2(__hip_bfloat162 val) {
+__device__ __forceinline__ static float2 half22float2(__nv_bfloat162 val) {
     return __bfloat1622float2(val);
 }
@@ -176,7 +176,7 @@ __device__ __forceinline__ half2 float22half2<half2>(float2 val) {
 }
 template<>
-__device__ __forceinline__ __hip_bfloat162 float22half2<__hip_bfloat162>(float2 val) {
+__device__ __forceinline__ __nv_bfloat162 float22half2<__nv_bfloat162>(float2 val) {
     return __float22bfloat162_rn(val);
 }
@@ -334,13 +334,13 @@ __device__ __forceinline__ static half2 h2div(half2 a, half2 b) {
     of.y = __fdividef(af.y, bf.y);
     return float22half2<half2>(of);
 };
-__device__ __forceinline__ static __hip_bfloat162 h2div(__hip_bfloat162 a, __hip_bfloat162 b) {
+__device__ __forceinline__ static __nv_bfloat162 h2div(__nv_bfloat162 a, __nv_bfloat162 b) {
     float2 af = half22float2(a);
     float2 bf = half22float2(b);
     float2 of;
     of.x = __fdividef(af.x, bf.x);
     of.y = __fdividef(af.y, bf.y);
-    return float22half2<__hip_bfloat162>(of);
+    return float22half2<__nv_bfloat162>(of);
 };
 __device__ __forceinline__ static void reduce_add(float *addr, float val) {
src/kernels/zgemm/gemm_w4a4.hip → src/kernels/zgemm/gemm_w4a4.cu (file moved)
src/kernels/zgemm/gemm_w4a4.cuh

-#include "hip/hip_runtime.h"
 #pragma once
 #include "gemm_base.cuh"
@@ -26,7 +25,7 @@ public:
     // micro-scales for FP4 MMA
     // each uint32_t is a 4*32 matrix of scales (for MMA of 64*32)
-#if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 1200
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1200
     static constexpr bool FP4_AVAILABLE = true;
 #else
     static constexpr bool FP4_AVAILABLE = false;
@@ -624,7 +623,7 @@ public:
     // each thread block (1 warp) quantize WARP_M * WARP_K tile (32 * 64)
     struct quantize_w4a4_act_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
        __device__ void operator()(const half_t *input, packed_act_t *output, packed_ascale_t *oscales, int K) {
            const int laneId = threadIdx.x % WARP_SIZE;
@@ -661,7 +660,7 @@ public:
     // each thread block (1 warp) quantize WARP_N * WARP_K tile (128 * 64)
     struct quantize_w4a4_wgt_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
        __device__ void operator()(const half_t *input, packed_wgt_t *output, packed_wscale_t *oscales, int K) {
            const int laneId = threadIdx.x % WARP_SIZE;
@@ -722,9 +721,9 @@ public:
     template<bool ACT_UNSIGNED, typename T>
     __device__ __forceinline__ static void compute(act_warp A, wgt_warp W, ascale_warp ascale, wscale_warp wscale, T &fpsum) {
-#if defined(__DTK_ARCH__) && __DTK_ARCH__ == 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 800
        using int2half2 = i2f_sm80;
-#elif defined(__DTK_ARCH__) && __DTK_ARCH__ == 750
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
        using int2half2 = std::conditional_t<Config::FASTER_I2F, i2f_sm75_fast, i2f_sm75>;
        ;
 #else
@@ -902,7 +901,7 @@ public:
            compute<ACT_UNSIGNED>(A[k2], W[k2], ascale[k2], wscale[k2], fpsum);
-            // #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
+            // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
            if (alwaysfalse) {
                dummy = clock();
            }
@@ -1046,7 +1045,7 @@ public:
     template<typename Epilogue, bool ACT_UNSIGNED>
     struct gemm_w4a4_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
        static constexpr int MAX_ARCH = Config::FASTER_I2F ? 750 : INT_MAX; // FASTER_I2F is only needed on sm_75
        __device__ void operator()(const packed_act_t *act,
@@ -1099,7 +1098,7 @@ public:
     struct quantize_w4a4_fuse_lora_kernel {
        using oscales_t = typename std::conditional_t<use_fp4, packed_amscale_t, packed_ascale_t>;
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
        static constexpr size_t SHMEM_PER_WARP = ceilDiv<size_t>(Base::template load_act_to_fpsum<fuse_glu>::SHMEM_SIZE, 128) * 128;
        static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
src/kernels/zgemm/gemm_w4a4_launch_bf16_fp4.hip → src/kernels/zgemm/gemm_w4a4_launch_bf16_fp4.cu (file moved)
src/kernels/zgemm/gemm_w4a4_launch_bf16_int4.hip → src/kernels/zgemm/gemm_w4a4_launch_bf16_int4.cu (file moved)