Project: fengzch-das / nunchaku · Commits

Commit 1a8114bf — authored Nov 21, 2025 by fengzch-das

    hipify code

Parent: c0177256 · Pipeline #3049 canceled with stages · Changes: 50 · Pipelines: 2

Showing 20 changed files on this page, with 141 additions and 130 deletions (+141 -130).
Files changed on this page (20):

    src/kernels/dwconv.hip                               +4   -4
    src/kernels/gemm_batched.hip                         +0   -0
    src/kernels/gemm_f16.hip                             +0   -0
    src/kernels/gemm_w8a8.hip                            +0   -0
    src/kernels/layernorm_kernels.h                      +1   -1
    src/kernels/layernorm_kernels.hip                    +22  -21
    src/kernels/layernorm_kernels_impl.cuh               +2   -1
    src/kernels/misc_kernels.hip                         +30  -29
    src/kernels/misc_kernels_impl.cuh                    +3   -2
    src/kernels/reduction_utils.cuh                      +1   -0
    src/kernels/utils.cuh                                +42  -41
    src/kernels/zgemm/attention.cuh                      +5   -4
    src/kernels/zgemm/attention.hip                      +4   -3
    src/kernels/zgemm/epilogues.cuh                      +3   -2
    src/kernels/zgemm/gemm_base.cuh                      +11  -10
    src/kernels/zgemm/gemm_utils.cuh                     +4   -4
    src/kernels/zgemm/gemm_w4a4.cuh                      +9   -8
    src/kernels/zgemm/gemm_w4a4.hip                      +0   -0
    src/kernels/zgemm/gemm_w4a4_launch_bf16_fp4.hip      +0   -0
    src/kernels/zgemm/gemm_w4a4_launch_bf16_int4.hip     +0   -0
src/kernels/dwconv.cu → src/kernels/dwconv.hip  (+4 -4)

@@ -3,7 +3,7 @@
 #include "dispatch_cutlass.h"
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/conv/device/direct_convolution.h"

@@ -74,7 +74,7 @@ static cutlass::Status depthwise_conv2d_kernel_run(cutlass::conv::Conv2dProblemS
     UnderlyingKernel::ElementA *A, UnderlyingKernel::ElementB *B,
     UnderlyingKernel::ElementC *C, UnderlyingKernel::ElementC *D,
     ElementCompute alpha, ElementCompute beta, std::string split_k_mode,
-    cudaStream_t stream, int device_id = 0)
+    hipStream_t stream, int device_id = 0)
 {
     // create the tensor references
     cutlass::Tensor4DCoord tensor_coord_A = cutlass::conv::implicit_gemm_tensor_a_extent(

@@ -183,7 +183,7 @@ Tensor depthwise_conv2d_kernel(Tensor A, Tensor B) {
     Tensor D = Tensor::allocate({N, P, Q, K}, A.dtype(), A.device());
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     cutlass::Status status = depthwise_conv2d_kernel_run(
         &problem_size,

@@ -319,7 +319,7 @@ Tensor dwconv_f16(Tensor input, Tensor weight, Tensor out, Tensor bias) {
     size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
    BufferCUDA workspace(workspace_size);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     cutlass::Status status = implicit_gemm_op.can_implement(arguments);
     if (status != cutlass::Status::kSuccess) {
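The dwconv changes above are the two recurring substitutions this commit applies to the CUTLASS-based files: CUDA headers and handle types are mapped one-for-one onto HIP (cuda_runtime.h becomes hip/hip_runtime.h, cudaStream_t becomes hipStream_t), and the Torch-style stream getter becomes getCurrentHIPStreamMasqueradingAsCUDA(). A minimal sketch of that mapping; the stream helper is stubbed out here only so the snippet is self-contained, in the repo it comes from the project's Torch/HIP glue:

    #include <hip/hip_runtime.h>

    // Hypothetical stand-in for the project's stream helper, declared here only
    // so the sketch compiles on its own; the real one lives in the Torch/HIP glue.
    static hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
        return nullptr; // null stream = default HIP stream
    }

    void example_launch_site() {
        // was: cudaStream_t stream = getCurrentCUDAStream();
        hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
        (void)stream; // forwarded to the CUTLASS / kernel calls exactly as before
    }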
src/kernels/gemm_batched.cu → src/kernels/gemm_batched.hip  (file moved)

src/kernels/gemm_f16.cu → src/kernels/gemm_f16.hip  (file moved)

src/kernels/gemm_w8a8.cu → src/kernels/gemm_w8a8.hip  (file moved)
src/kernels/layernorm_kernels.h  (+1 -1)

@@ -2,7 +2,7 @@
 #include "common.h"
 #include "Tensor.h"
-#include <cuda_fp16.h>
+#include <hip/hip_fp16.h>
 void rms_norm(Tensor &out,   // [num_tokens, hidden_size]
               Tensor &input, // [num_tokens, hidden_size]
src/kernels/layernorm_kernels.cu → src/kernels/layernorm_kernels.hip  (+22 -21)

(top of file)
+#include "hip/hip_runtime.h"
 #include "layernorm_kernels_impl.cuh"
 #include "dispatch_utils.h"

@@ -10,17 +11,17 @@ void rms_norm(Tensor &out, // [..., hidden_size]
     int num_tokens = input.numel() / hidden_size;
     dim3 grid(num_tokens);
     dim3 block(std::min(hidden_size, 1024));
-    const cudaStream_t stream = getCurrentCUDAStream();
+    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
         if (use_quant) {
-            vllm::rms_norm_kernel<scalar_t, int8_t, true><<<grid, block, 0, stream>>>(out.data_ptr<int8_t>(),
+            hipLaunchKernelGGL((vllm::rms_norm_kernel<scalar_t, int8_t, true>), dim3(grid), dim3(block), 0, stream, out.data_ptr<int8_t>(),
                 input.data_ptr<scalar_t>(),
                 weight.data_ptr<scalar_t>(),
                 epsilon,
                 num_tokens,
                 hidden_size);
         } else {
-            vllm::rms_norm_kernel<scalar_t, scalar_t, false><<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),
+            hipLaunchKernelGGL((vllm::rms_norm_kernel<scalar_t, scalar_t, false>), dim3(grid), dim3(block), 0, stream, out.data_ptr<scalar_t>(),
                 input.data_ptr<scalar_t>(),
                 weight.data_ptr<scalar_t>(),
                 epsilon,

@@ -39,10 +40,10 @@ void layernorm_general(Tensor out, Tensor input, Tensor weight, Tensor bias, flo
     size_t size_shmem = input.scalar_size() * hidden_size;
-    const cudaStream_t stream = getCurrentCUDAStream();
+    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
         using T = typename packed_as<scalar_t, 2>::type;
-        vllm::generalLayerNorm<T, half, true><<<grid, block, size_shmem, stream>>>(
+        hipLaunchKernelGGL((vllm::generalLayerNorm<T, half, true>), dim3(grid), dim3(block), size_shmem, stream,
             reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
             weight.valid() ? reinterpret_cast<T *>(weight.data_ptr<scalar_t>()) : nullptr,
             bias.valid() ? reinterpret_cast<T *>(bias.data_ptr<scalar_t>()) : nullptr,

@@ -69,13 +70,13 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
     dim3 block(std::min(hidden_size, 1024));
     block.x = 32 * ((block.x + 31) / 32);
-    const cudaStream_t stream = getCurrentCUDAStream();
+    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
         using T = scalar_t;
         if (use_per_token_quant) {
             // per-token
-            vllm::generalLayerNorm<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            hipLaunchKernelGGL((vllm::generalLayerNorm<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
                 nullptr,
                 nullptr,

@@ -92,8 +93,8 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
             // weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
         } else {
             // per-tensor
-            vllm::generalLayerNorm<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            hipLaunchKernelGGL((vllm::generalLayerNorm<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
                 nullptr,
                 nullptr,

@@ -121,13 +122,13 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
     dim3 block(std::min(hidden_size, 1024));
     block.x = 32 * ((block.x + 31) / 32);
-    const cudaStream_t stream = getCurrentCUDAStream();
+    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
     VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm_fuse_sum", [&] {
         using T = scalar_t;
         if (use_per_token_quant) {
             // per-token
-            vllm::generalLayerNorm_fuse_sum<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            hipLaunchKernelGGL((vllm::generalLayerNorm_fuse_sum<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
                 nullptr,
                 nullptr,

@@ -149,8 +150,8 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
             // Not implemented per-tensor input_sum
             assert(false);
-            vllm::generalLayerNorm_fuse_sum<T, half><<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
+            hipLaunchKernelGGL((vllm::generalLayerNorm_fuse_sum<T, half>), dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
                 reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
                 nullptr,
                 nullptr,

@@ -176,10 +177,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
     int num_tokens = input.numel() / hidden_size;
     dim3 grid(num_tokens);
     dim3 block(std::min(hidden_size, 1024));
-    const cudaStream_t stream = getCurrentCUDAStream();
+    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
     VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
-        vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false><<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(),
+        hipLaunchKernelGGL((vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false>), dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
             residual.data_ptr<scalar_t>(),
             out.data_ptr<int8_t>(),
             gamma.data_ptr<scalar_t>(),

@@ -202,10 +203,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
     dim3 grid(num_tokens);
     dim3 block(std::min(hidden_size, 1024));
-    const cudaStream_t stream = getCurrentCUDAStream();
+    const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
     VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
-        vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true><<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(),
+        hipLaunchKernelGGL((vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true>), dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
             residual.data_ptr<scalar_t>(),
             out.data_ptr<int8_t>(),
             gamma.data_ptr<scalar_t>(),
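Besides the stream-type rename, every launch in this file is rewritten from the CUDA triple-chevron syntax to the hipLaunchKernelGGL macro: the grid and block arguments are wrapped in dim3 and the kernel arguments simply follow the stream. A minimal, self-contained sketch of that rewrite with a hypothetical kernel (the real kernels live in layernorm_kernels_impl.cuh):

    #include <hip/hip_runtime.h>

    // Illustrative kernel only; it stands in for vllm::rms_norm_kernel et al.
    __global__ void scale_kernel(float *x, float s, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            x[i] *= s;
    }

    void launch_scale(float *x, float s, int n, hipStream_t stream) {
        dim3 grid((n + 255) / 256), block(256);
        // CUDA form:  scale_kernel<<<grid, block, 0, stream>>>(x, s, n);
        // HIP form emitted by hipify:
        hipLaunchKernelGGL((scale_kernel), dim3(grid), dim3(block), 0, stream, x, s, n);
    }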
src/kernels/layernorm_kernels_impl.cuh  (+2 -1)

(top of file)
-#include <cuda_bf16.h>
+#include "hip/hip_runtime.h"
+#include <hip/hip_bf16.h>
 #define ENABLE_BF16 1
src/kernels/misc_kernels.cu → src/kernels/misc_kernels.hip  (+30 -29)

(top of file)
+#include "hip/hip_runtime.h"
 #include "misc_kernels_impl.cuh"
 #include "misc_kernels.h"
 #include "dispatch_utils.h"

@@ -13,12 +14,12 @@ Tensor add(Tensor a, Tensor b) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (a.numel() + threadsPerBlock - 1) / threadsPerBlock;
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     Tensor out = Tensor::empty_like(a);
     dispatch(out.scalar_type(), [&]<typename scalar_t>() {
-        add_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+        hipLaunchKernelGGL((add_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
            a.data_ptr<scalar_t>(), b.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(), out.numel());
     });

@@ -46,12 +47,12 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
         if (scale.valid()) {
-            mul_add_kernel<scalar_t, unroll, false><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
+            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, false>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
                 scale.data_ptr<scalar_t>(),
                 bias.data_ptr<scalar_t>(),
                 0,

@@ -62,7 +63,7 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
                 0,
                 0);
         } else {
-            mul_add_kernel<scalar_t, unroll, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, true>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
                 x.data_ptr<scalar_t>(), nullptr, bias.data_ptr<scalar_t>(), 0, x.numel(), 1, bias.numel(), 0, 0, 0);
         }
     });

@@ -96,12 +97,12 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
     int threadsPerBlock = 1024;
     dim3 grid(ceilDiv(numel, threadsPerBlock * unroll), batch_size);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
         if (scale.valid()) {
-            mul_add_kernel<scalar_t, unroll, false><<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
+            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, false>), dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
                 scale.data_ptr<scalar_t>(),
                 bias.data_ptr<scalar_t>(),
                 (scalar_t)scale_shift,

@@ -112,8 +113,8 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
                 batch_scale ? scale.stride(0) : 0,
                 batch_bias ? bias.stride(0) : 0);
         } else {
-            mul_add_kernel<scalar_t, unroll, true><<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
+            hipLaunchKernelGGL((mul_add_kernel<scalar_t, unroll, true>), dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
                 nullptr,
                 bias.data_ptr<scalar_t>(),
                 (scalar_t)scale_shift,

@@ -134,12 +135,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
     auto shapeOut = input_id.shape;
     shapeOut.dataExtent.push_back(lookup.shape[-1]);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     Tensor out = Tensor::empty(shapeOut, lookup.scalar_type(), input_id.device());
     dispatch(out.scalar_type(), [&]<typename scalar_t>() {
-        EmbeddingKernel<<<input_id.numel(), std::min(lookup.shape[-1], 1024), 0, stream>>>(
+        hipLaunchKernelGGL((EmbeddingKernel), dim3(input_id.numel()), dim3(std::min(lookup.shape[-1], 1024)), 0, stream,
            input_id.data_ptr<int32_t>(), out.data_ptr<scalar_t>(), lookup.data_ptr<scalar_t>(), lookup.shape[-1]);
     });

@@ -149,12 +150,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
 Tensor argmax_sample(Tensor logits) {
     assert(logits.ndims() == 2);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     Tensor out = Tensor::empty({logits.shape[0]}, Tensor::INT32, logits.device());
     dispatch(logits.scalar_type(), [&]<typename scalar_t>() {
-        argmax_sample_kernel<<<logits.shape[0], std::min(logits.shape[1], 1024), 0, stream>>>(
+        hipLaunchKernelGGL((argmax_sample_kernel), dim3(logits.shape[0]), dim3(std::min(logits.shape[1], 1024)), 0, stream,
            logits.data_ptr<scalar_t>(), out.data_ptr<int32_t>(), logits.shape[1]);
     });

@@ -167,7 +168,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
     // assert(qkv.shape[0] == k.shape[0]);
     // assert(qkv.shape[0] == v.shape[0]);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     int dim_q = q.shape[-1] * q.shape[-2];
     int dim_k = k.shape[-1] * k.shape[-2];

@@ -179,7 +180,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
     int num_tokens = qkv.numel() / qkv.shape[-1];
     dispatch(qkv.scalar_type(), [&]<typename scalar_t>() {
-        splitqkv_kernel<<<num_tokens, std::min(qkv.shape[-1], 1024), 0, stream>>>(qkv.data_ptr<scalar_t>(),
+        hipLaunchKernelGGL((splitqkv_kernel), dim3(num_tokens), dim3(std::min(qkv.shape[-1], 1024)), 0, stream, qkv.data_ptr<scalar_t>(),
            q.data_ptr<scalar_t>(),
            k.data_ptr<scalar_t>(),
            v.data_ptr<scalar_t>(),

@@ -195,7 +196,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (input.numel() + threadsPerBlock - 1) / threadsPerBlock;
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     auto shapeOut = TensorShape(input.shape.dataExtent);
     shapeOut[-1] /= N;

@@ -210,7 +211,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
         for (int k = 0; k < N; k++) {
             outPtr[k] = out[k].template data_ptr<scalar_t>();
         }
-        split_mod_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+        hipLaunchKernelGGL((split_mod_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
            input.data_ptr<scalar_t>(), outPtr, input.numel());
     });

@@ -227,10 +228,10 @@ Tensor quant_static(Tensor x, float scale) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
-        quant_kernel_static<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+        hipLaunchKernelGGL((quant_kernel_static<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
            x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
     });

@@ -247,10 +248,10 @@ Tensor quant_static_fuse_gelu(Tensor x, float scale) {
     int threadsPerBlock = 1024;
     int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     dispatch(x.scalar_type(), [&]<typename scalar_t>() {
-        quant_kernel_static_fuse_gelu<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+        hipLaunchKernelGGL((quant_kernel_static_fuse_gelu<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
            x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
     });

@@ -266,7 +267,7 @@ void cast(Tensor input, Tensor output) {
         assert(input.scalar_size() == output.scalar_size());
     }
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     dispatch(input.scalar_type(), [&]<typename input_t>() {
         dispatch(output.scalar_type(), [&]<typename output_t>() {

@@ -275,10 +276,10 @@ void cast(Tensor input, Tensor output) {
             int threadsPerBlock = 1024;
             int blocksPerGrid = (int)ceilDiv<int64_t>(input.numel(), threadsPerBlock * unroll);
-            cast_kernel<input_t, output_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+            hipLaunchKernelGGL((cast_kernel<input_t, output_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
                input.data_ptr<input_t>(), output.data_ptr<output_t>(), input.numel());
-            checkCUDA(cudaGetLastError());
+            checkCUDA(hipGetLastError());
         });
     });
 }

@@ -298,7 +299,7 @@ Tensor topk(Tensor x, int k) {
     Tensor out = Tensor::empty(outShape, Tensor::INT32, x.device());
-    auto stream = getCurrentCUDAStream();
+    auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
     dispatchVal(k, std::make_integer_sequence<int, MAXK + 1>(), [&]<int K>() {
         if constexpr (K == 0) {

@@ -307,9 +308,9 @@ Tensor topk(Tensor x, int k) {
         }
         if constexpr (K > 0) {
             dispatch(x.scalar_type(), [&]<typename scalar_t>() {
-                topk_kernel<scalar_t, K><<<ceilDiv(batch, 32), 32, 0, stream>>>(
+                hipLaunchKernelGGL((topk_kernel<scalar_t, K>), dim3(ceilDiv(batch, 32)), dim3(32), 0, stream,
                    x.data_ptr<scalar_t>(), out.data_ptr<int>(), N, x.stride(-2), batch);
-                checkCUDA(cudaGetLastError());
+                checkCUDA(hipGetLastError());
             });
         }
     });
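Beyond launches and streams, the only other substitution in this file is the error query: cudaGetLastError() becomes hipGetLastError(), still wrapped in the project's existing checkCUDA macro. A sketch of that post-launch check, with a hypothetical CHECK_HIP wrapper standing in for checkCUDA:

    #include <hip/hip_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Hypothetical wrapper standing in for the repo's checkCUDA macro.
    #define CHECK_HIP(expr)                                                    \
        do {                                                                   \
            hipError_t err_ = (expr);                                          \
            if (err_ != hipSuccess) {                                          \
                std::fprintf(stderr, "HIP error %s at %s:%d\n",                \
                             hipGetErrorString(err_), __FILE__, __LINE__);     \
                std::abort();                                                  \
            }                                                                  \
        } while (0)

    void after_launch_check() {
        // Same pattern as the diff above: checkCUDA(hipGetLastError());
        CHECK_HIP(hipGetLastError());
    }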
src/kernels/misc_kernels_impl.cuh  (+3 -2)

(top of file)
+#include "hip/hip_runtime.h"
 #include "reduction_utils.cuh"
 #include <array>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bf16.h>
 #include "utils.cuh"
 #include "activation_kernels_impl.cuh"
src/kernels/reduction_utils.cuh  (+1 -0)

(top of file)
+#include "hip/hip_runtime.h"
 /*
  * Adapted from
  * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
src/kernels/utils.cuh  (+42 -41)

(top of file)
+#include "hip/hip_runtime.h"
 // Adated from FasterTransformer,
 // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
 #pragma once

@@ -9,10 +10,10 @@
 #include <cstdio>
-#include <cuda_fp16.h>
+#include <hip/hip_fp16.h>
 #ifdef ENABLE_BF16
-#include <cuda_bf16.h>
+#include <hip/hip_bf16.h>
 #endif
 __device__ __forceinline__ static void trap_unsupported_arch() {

@@ -24,11 +25,11 @@ __device__ __forceinline__ static void trap_unsupported_arch() {
     __trap();
 }
-#if defined(ENABLE_BF16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-__device__ __forceinline__ static __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) {
+#if defined(ENABLE_BF16) && defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
+__device__ __forceinline__ static __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c) {
     trap_unsupported_arch();
-    return __nv_bfloat162(0.0f, 0.0f);
+    return __hip_bfloat162(0.0f, 0.0f);
 }
 #endif

@@ -56,11 +57,11 @@ struct num_elems<half2> {
 };
 #ifdef ENABLE_BF16
 template<>
-struct num_elems<__nv_bfloat16> {
+struct num_elems<__hip_bfloat16> {
     static constexpr int value = 1;
 };
 template<>
-struct num_elems<__nv_bfloat162> {
+struct num_elems<__hip_bfloat162> {
     static constexpr int value = 2;
 };
 #endif

@@ -107,12 +108,12 @@ struct packed_as<float2, 1> {
 };
 #ifdef ENABLE_BF16
 template<>
-struct packed_as<__nv_bfloat16, 2> {
-    using type = __nv_bfloat162;
+struct packed_as<__hip_bfloat16, 2> {
+    using type = __hip_bfloat162;
 };
 template<>
-struct packed_as<__nv_bfloat162, 1> {
-    using type = __nv_bfloat16;
+struct packed_as<__hip_bfloat162, 1> {
+    using type = __hip_bfloat16;
 };
 #endif
 #ifdef ENABLE_FP8

@@ -169,8 +170,8 @@ inline __device__ T ldg(const T *val) {
 #define bf1622float2 __bfloat1622float2
 #define float22bf162 __float22bfloat162_rn
 #define bf162bf162 __bfloat162bfloat162
-inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+inline __device__ int16_t bf1622int16(__hip_bfloat162 val) {
+#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
     float2 f_val;
     f_val.x = max(min(__low2float(val), 127.f), -128.f);
     f_val.y = max(min(__high2float(val), 127.f), -128.f);

@@ -201,8 +202,8 @@ inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
 #if ENABLE_BF16
 template<>
-inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+inline __device__ __hip_bfloat162 ldg(const __hip_bfloat162 *val) {
+#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
     return val[0];
 #else
     return __ldg(val);

@@ -210,8 +211,8 @@ inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) {
 }
 template<>
-inline __device__ __nv_bfloat16 ldg(const __nv_bfloat16 *val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+inline __device__ __hip_bfloat16 ldg(const __hip_bfloat16 *val) {
+#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
     return val[0];
 #else
     return __ldg(val);

@@ -330,81 +331,81 @@ __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) {
 #ifdef ENABLE_BF16
 template<>
-__device__ inline __nv_bfloat16 cuda_cast(int32_t val) {
+__device__ inline __hip_bfloat16 cuda_cast(int32_t val) {
     return static_cast<float>(val);
 }
 template<>
-__device__ inline __nv_bfloat16 cuda_cast(int8_t val) {
+__device__ inline __hip_bfloat16 cuda_cast(int8_t val) {
     return static_cast<float>(val);
 }
 template<>
-__device__ inline int8_t cuda_cast(__nv_bfloat16 val) {
+__device__ inline int8_t cuda_cast(__hip_bfloat16 val) {
     return static_cast<float>(val);
 }
 template<>
-__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
+__device__ inline float cuda_cast<float, __hip_bfloat16>(__hip_bfloat16 val) {
     return __bfloat162float(val);
 }
 template<>
-__device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) {
+__device__ inline float2 cuda_cast<float2, __hip_bfloat162>(__hip_bfloat162 val) {
     return bf1622float2(val);
 }
 template<>
-__device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val) {
+__device__ inline half cuda_cast<half, __hip_bfloat16>(__hip_bfloat16 val) {
     return __float2half(__bfloat162float(val));
 }
 template<>
-__device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) {
+__device__ inline int16_t cuda_cast<int16_t, __hip_bfloat162>(__hip_bfloat162 val) {
     return bf1622int16(val);
 }
 template<>
-__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) {
+__device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, float>(float val) {
     return __float2bfloat16(val);
 }
 template<>
-__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val) {
+__device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, half>(half val) {
     return __float2bfloat16(__half2float(val));
 }
 template<>
-__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val) {
+__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, __hip_bfloat16>(__hip_bfloat16 val) {
     return bf162bf162(val);
 }
 template<>
-__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val) {
+__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float>(float val) {
     return __float2bfloat162_rn(val);
 }
 template<>
-__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) {
+__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float2>(float2 val) {
     return float22bf162(val);
 }
 template<>
-__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val) {
+__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, int16_t>(int16_t val) {
     union {
         int8_t int8[2];
         int16_t int16;
     };
     int16 = val;
-    __nv_bfloat162 res;
-    res.x = cuda_cast<__nv_bfloat16>(int8[0]);
-    res.y = cuda_cast<__nv_bfloat16>(int8[1]);
+    __hip_bfloat162 res;
+    res.x = cuda_cast<__hip_bfloat16>(int8[0]);
+    res.y = cuda_cast<__hip_bfloat16>(int8[1]);
     return res;
 }
 template<>
-__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) {
+__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, half2>(half2 val) {
     return float22bf162(__half22float2(val));
 }

@@ -420,7 +421,7 @@ __device__ __forceinline__ packed_as<half, 2>::type f162f162<half>(half x) {
 #ifdef ENABLE_BF16
 template<>
-__device__ __forceinline__ packed_as<__nv_bfloat16, 2>::type f162f162<__nv_bfloat16>(__nv_bfloat16 x) {
+__device__ __forceinline__ packed_as<__hip_bfloat16, 2>::type f162f162<__hip_bfloat16>(__hip_bfloat16 x) {
     return __bfloat162bfloat162(x);
 }
 #endif

@@ -453,8 +454,8 @@ __device__ inline half cuda_max(half2 val) {
 #ifdef ENABLE_BF16
 template<>
-__device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val) {
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+__device__ inline __hip_bfloat16 cuda_max(__hip_bfloat162 val) {
+#if (defined(__DTK_ARCH__) && (__DTK_ARCH__ >= 800))
     return __hmax(val.x, val.y);
 #else
     assert(false);

@@ -497,14 +498,14 @@ __device__ inline half2 cuda_abs(half2 val) {
 #ifdef ENABLE_BF16
-#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
+#if __DTK_ARCH__ >= 800 || !defined(__DTK_ARCH__)
 template<>
-__device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) {
+__device__ inline __hip_bfloat16 cuda_abs(__hip_bfloat16 val) {
     return __habs(val);
 }
 template<>
-__device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) {
+__device__ inline __hip_bfloat162 cuda_abs(__hip_bfloat162 val) {
     return __habs2(val);
 }
 #endif
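utils.cuh is mostly a mechanical type rename: every __nv_bfloat16/__nv_bfloat162 becomes __hip_bfloat16/__hip_bfloat162, and the device-architecture guards switch from __CUDA_ARCH__ to __DTK_ARCH__ (the arch macro of the target toolchain, as introduced by this diff), while the surrounding template machinery is untouched. A small self-contained sketch of the resulting pattern; packed_as_sketch is a hypothetical stand-in for the real packed_as trait, and the fallback branch only illustrates the shape of the guards in this file:

    #include <hip/hip_bf16.h>

    // Trait mirroring packed_as<> after the rename (was __nv_bfloat16/__nv_bfloat162).
    template<typename T, int N> struct packed_as_sketch;
    template<> struct packed_as_sketch<__hip_bfloat16, 2> { using type = __hip_bfloat162; };
    template<> struct packed_as_sketch<__hip_bfloat162, 1> { using type = __hip_bfloat16; };

    // Arch-gated helper in the same shape as the guards above.
    __device__ inline float2 bf162_to_float2_sketch(__hip_bfloat162 v) {
    #if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
        // older targets fall back to per-lane float conversion
        return make_float2(__bfloat162float(v.x), __bfloat162float(v.y));
    #else
        return __bfloat1622float2(v);
    #endif
    }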
src/kernels/zgemm/attention.cuh  (+5 -4)

(top of file)
+#include "hip/hip_runtime.h"
 #pragma once
 #include "gemm_base.cuh"

@@ -26,8 +27,8 @@ struct AttentionFP16Config {
     using half_t  = half;
     using half2_t = half2;
-    using epilogue_half_t  = typename std::conditional_t<bf16out, __nv_bfloat16, half>;
-    using epilogue_half2_t = typename std::conditional_t<bf16out, __nv_bfloat162, half2>;
+    using epilogue_half_t  = typename std::conditional_t<bf16out, __hip_bfloat16, half>;
+    using epilogue_half2_t = typename std::conditional_t<bf16out, __hip_bfloat162, half2>;
 };
 using AttentionFP16Config_FP16 = AttentionFP16Config<false>;

@@ -60,7 +61,7 @@ public:
     using typename AttentionConfig::epilogue_half_t;
     using typename AttentionConfig::epilogue_half2_t;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
     static constexpr bool IS_SM80 = true;
 #else
     static constexpr bool IS_SM80 = false;

@@ -657,7 +658,7 @@ public:
     template<typename Epilogue>
     struct attention_fp16_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
         static constexpr int SHMEM_SIZE = 0; // sizeof(q_shmem_t);
         __device__ void operator()(const packed_q_t *ptr_q,
src/kernels/zgemm/attention.cu → src/kernels/zgemm/attention.hip  (+4 -3)

(top of file)
+#include "hip/hip_runtime.h"
 #include "zgemm.h"
 #include "attention.cuh"

@@ -71,10 +72,10 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
         shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE);
         if (shmem >= 24 * 1024) {
-            checkCUDA(cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
+            checkCUDA(hipFuncSetAttribute(func, hipFuncAttributeMaxDynamicSharedMemorySize, shmem));
         }
-        func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(q.data_ptr<packed_q_t>(),
+        hipLaunchKernelGGL(( func), dim3(grid), dim3(GEMM::WARP_SIZE * GEMM::NUM_WARPS), shmem, getCurrentHIPStreamMasqueradingAsCUDA(), q.data_ptr<packed_q_t>(),
            k.data_ptr<packed_k_t>(),
            v.data_ptr<packed_v_t>(),
            scale,

@@ -82,7 +83,7 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
            numTokensKV,
            args,
            false);
-        checkCUDA(cudaGetLastError());
+        checkCUDA(hipGetLastError());
     };
     launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{
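attention.hip also carries over the dynamic-shared-memory opt-in: when a launch needs more than the default dynamic LDS, the limit is raised with hipFuncSetAttribute before launching, mirroring the cudaFuncSetAttribute call it replaces. A self-contained sketch with an illustrative kernel; the 24 KiB threshold is the one used in the hunk above:

    #include <hip/hip_runtime.h>

    __global__ void attention_like_kernel(float *out) {
        extern __shared__ char smem[]; // dynamic shared memory, sized at launch
        (void)smem;
        if (threadIdx.x == 0)
            out[0] = 0.0f;
    }

    void launch_with_big_shmem(float *out, size_t shmem, hipStream_t stream) {
        if (shmem >= 24 * 1024) {
            hipFuncSetAttribute(reinterpret_cast<const void *>(attention_like_kernel),
                                hipFuncAttributeMaxDynamicSharedMemorySize,
                                static_cast<int>(shmem));
        }
        hipLaunchKernelGGL((attention_like_kernel), dim3(1), dim3(256), shmem, stream, out);
    }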
src/kernels/zgemm/epilogues.cuh  (+3 -2)

(top of file)
+#include "hip/hip_runtime.h"
 #pragma once
 #include "gemm_base.cuh"

@@ -702,7 +703,7 @@ public:
     // q: [batch_size, #blocks, block_size, #heads, HEAD_DIM]
     // vk: [batch_size, #heads, HEAD_DIM+1, HEAD_DIM]
     struct vk_mul_q_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
         // FIXME FIXME FIXME
         __device__ void operator()(half_t *q, const float *vk, float eps, int num_tokens) {
             const int block_id = blockIdx.x;

@@ -762,7 +763,7 @@ public:
     template<typename Epilogue>
     struct test_epilogue_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
         static constexpr size_t SHMEM_PER_WARP = ceilDiv<size_t>(Base::template load_act_to_fpsum<false>::SHMEM_SIZE, 128) * 128;
         static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
src/kernels/zgemm/gemm_base.cuh  (+11 -10)

(top of file)
+#include "hip/hip_runtime.h"
 #pragma once
 #include "common.h"

@@ -44,8 +45,8 @@ public:
     // may generate incorrect results in certain circumstances
     static constexpr bool FASTER_I2F = faster_i2f;
-    using half_t  = typename std::conditional_t<bf16, __nv_bfloat16, half>;
-    using half2_t = typename std::conditional_t<bf16, __nv_bfloat162, half2>;
+    using half_t  = typename std::conditional_t<bf16, __hip_bfloat16, half>;
+    using half2_t = typename std::conditional_t<bf16, __hip_bfloat162, half2>;
 };
 using GEMMConfig_W4A4_FP16 = GEMMConfig_W4A4<false>;

@@ -67,8 +68,8 @@ public:
     using half_t = half;
     using half2_t = half2;
 #else
-    using half_t  = __nv_bfloat16;
-    using half2_t = __nv_bfloat162;
+    using half_t  = __hip_bfloat16;
+    using half2_t = __hip_bfloat162;
 #endif
 };

@@ -202,9 +203,9 @@ public:
     __device__ __forceinline__ static packed_f32psum_t mma_f16xf16_f32(packed_fpsum_t a, packed_fpsum_t b, packed_f32psum_t psum) {
-        static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __nv_bfloat16>);
-        static constexpr bool is_bf16 = std::is_same_v<half_t, __nv_bfloat16>;
+        static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __hip_bfloat16>);
+        static constexpr bool is_bf16 = std::is_same_v<half_t, __hip_bfloat16>;
         uint4 out1 = mma_m16n8k16_f32f16f16f32<is_bf16>(kernels::bit_cast<uint4>(a),

@@ -890,8 +891,8 @@ constexpr int max_arch() {
 template<typename kernel, typename... T>
 __global__ static void invoke_kernel(T... args) {
-#ifdef __CUDA_ARCH__
-    if constexpr (__CUDA_ARCH__ >= min_arch<kernel>() && __CUDA_ARCH__ <= max_arch<kernel>()) {
+#ifdef __DTK_ARCH__
+    if constexpr (__DTK_ARCH__ >= min_arch<kernel>() && __DTK_ARCH__ <= max_arch<kernel>()) {
         kernel()(args...);
     } else {
         trap_unsupported_arch();

@@ -916,8 +917,8 @@ template<typename T>
 static void test_sizeof() {
     printf("typeid = %s\n", typeid(T).name());
     test_sizeof_host<T>();
-    test_sizeof_device<T><<<1, 1>>>();
-    checkCUDA(cudaDeviceSynchronize());
+    hipLaunchKernelGGL((test_sizeof_device<T>), dim3(1), dim3(1), 0, 0, );
+    checkCUDA(hipDeviceSynchronize());
 }
 }; // namespace nunchaku::kernels
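gemm_base.cuh keeps the compile-time architecture dispatch intact and only swaps the macro it keys on: invoke_kernel now compares __DTK_ARCH__ against min_arch/max_arch and traps on unsupported targets. A reduced sketch of that guard, assuming __DTK_ARCH__ is defined during device compilation the way __CUDA_ARCH__ is, with abort() and a fixed bound standing in for trap_unsupported_arch() and min_arch<kernel>():

    #include <hip/hip_runtime.h>

    // Hypothetical bound; mirrors min_arch<kernel>() from the real header.
    template<typename Kernel>
    constexpr int min_arch_sketch() { return 750; }

    template<typename Kernel, typename... Args>
    __global__ static void invoke_kernel_sketch(Args... args) {
    #ifdef __DTK_ARCH__
        if constexpr (__DTK_ARCH__ >= min_arch_sketch<Kernel>()) {
            Kernel()(args...);   // only instantiated for supported targets
        } else {
            abort();             // stands in for trap_unsupported_arch()
        }
    #endif
    }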
src/kernels/zgemm/gemm_utils.cuh  (+4 -4)

@@ -163,7 +163,7 @@ __device__ __forceinline__ static float2 half22float2(half2 val) {
     return __half22float2(val);
 }
-__device__ __forceinline__ static float2 half22float2(__nv_bfloat162 val) {
+__device__ __forceinline__ static float2 half22float2(__hip_bfloat162 val) {
     return __bfloat1622float2(val);
 }

@@ -176,7 +176,7 @@ __device__ __forceinline__ half2 float22half2<half2>(float2 val) {
 }
 template<>
-__device__ __forceinline__ __nv_bfloat162 float22half2<__nv_bfloat162>(float2 val) {
+__device__ __forceinline__ __hip_bfloat162 float22half2<__hip_bfloat162>(float2 val) {
     return __float22bfloat162_rn(val);
 }

@@ -334,13 +334,13 @@ __device__ __forceinline__ static half2 h2div(half2 a, half2 b) {
     of.y = __fdividef(af.y, bf.y);
     return float22half2<half2>(of);
 };
-__device__ __forceinline__ static __nv_bfloat162 h2div(__nv_bfloat162 a, __nv_bfloat162 b) {
+__device__ __forceinline__ static __hip_bfloat162 h2div(__hip_bfloat162 a, __hip_bfloat162 b) {
     float2 af = half22float2(a);
     float2 bf = half22float2(b);
     float2 of;
     of.x = __fdividef(af.x, bf.x);
     of.y = __fdividef(af.y, bf.y);
-    return float22half2<__nv_bfloat162>(of);
+    return float22half2<__hip_bfloat162>(of);
 };
 __device__ __forceinline__ static void reduce_add(float *addr, float val) {
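The h2div overload touched above shows the pattern these helpers rely on, unchanged by the port apart from the type rename: the packed bf16 pair is widened to float2, divided with __fdividef, and narrowed back. A standalone sketch of the same computation, written directly against the public HIP bf16 conversions instead of the file's half22float2/float22half2 wrappers:

    #include <hip/hip_bf16.h>

    __device__ __forceinline__ static __hip_bfloat162 h2div_sketch(__hip_bfloat162 a,
                                                                   __hip_bfloat162 b) {
        float2 af = __bfloat1622float2(a);
        float2 bf = __bfloat1622float2(b);
        float2 of;
        of.x = __fdividef(af.x, bf.x); // fast approximate division, as in the original
        of.y = __fdividef(af.y, bf.y);
        return __float22bfloat162_rn(of);
    }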
src/kernels/zgemm/gemm_w4a4.cuh  (+9 -8)

(top of file)
+#include "hip/hip_runtime.h"
 #pragma once
 #include "gemm_base.cuh"

@@ -25,7 +26,7 @@ public:
     // micro-scales for FP4 MMA
     // each uint32_t is a 4*32 matrix of scales (for MMA of 64*32)
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1200
+#if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 1200
     static constexpr bool FP4_AVAILABLE = true;
 #else
     static constexpr bool FP4_AVAILABLE = false;

@@ -623,7 +624,7 @@ public:
     // each thread block (1 warp) quantize WARP_M * WARP_K tile (32 * 64)
     struct quantize_w4a4_act_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
         __device__ void operator()(const half_t *input, packed_act_t *output, packed_ascale_t *oscales, int K) {
             const int laneId = threadIdx.x % WARP_SIZE;

@@ -660,7 +661,7 @@ public:
     // each thread block (1 warp) quantize WARP_N * WARP_K tile (128 * 64)
     struct quantize_w4a4_wgt_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
         __device__ void operator()(const half_t *input, packed_wgt_t *output, packed_wscale_t *oscales, int K) {
             const int laneId = threadIdx.x % WARP_SIZE;

@@ -721,9 +722,9 @@ public:
     template<bool ACT_UNSIGNED, typename T>
     __device__ __forceinline__ static void compute(act_warp A, wgt_warp W, ascale_warp ascale, wscale_warp wscale, T &fpsum) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 800
+#if defined(__DTK_ARCH__) && __DTK_ARCH__ == 800
         using int2half2 = i2f_sm80;
-#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
+#elif defined(__DTK_ARCH__) && __DTK_ARCH__ == 750
         using int2half2 = std::conditional_t<Config::FASTER_I2F, i2f_sm75_fast, i2f_sm75>;
         ;
 #else

@@ -901,7 +902,7 @@ public:
             compute<ACT_UNSIGNED>(A[k2], W[k2], ascale[k2], wscale[k2], fpsum);
-            // #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+            // #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
             if (alwaysfalse) {
                 dummy = clock();
             }

@@ -1045,7 +1046,7 @@ public:
     template<typename Epilogue, bool ACT_UNSIGNED>
     struct gemm_w4a4_kernel {
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
         static constexpr int MAX_ARCH = Config::FASTER_I2F ? 750 : INT_MAX; // FASTER_I2F is only needed on sm_75
         __device__ void operator()(const packed_act_t *act,

@@ -1098,7 +1099,7 @@ public:
     struct quantize_w4a4_fuse_lora_kernel {
         using oscales_t = typename std::conditional_t<use_fp4, packed_amscale_t, packed_ascale_t>;
-        static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
+        static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
         static constexpr size_t SHMEM_PER_WARP = ceilDiv<size_t>(Base::template load_act_to_fpsum<fuse_glu>::SHMEM_SIZE, 128) * 128;
         static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
src/kernels/zgemm/gemm_w4a4.cu → src/kernels/zgemm/gemm_w4a4.hip  (file moved)

src/kernels/zgemm/gemm_w4a4_launch_bf16_fp4.cu → src/kernels/zgemm/gemm_w4a4_launch_bf16_fp4.hip  (file moved)

src/kernels/zgemm/gemm_w4a4_launch_bf16_int4.cu → src/kernels/zgemm/gemm_w4a4_launch_bf16_int4.hip  (file moved)
(Only the first 20 of the 50 changed files are shown on this page; the diff continues on pages 2 and 3.)
fengzch-das (@Fzc7075) mentioned in commit 0a7c8614d9f6cd854550424762fe5d1d9372cefa · Nov 21, 2025

fengzch-das (@Fzc7075) mentioned in merge request !1 (merged) · Nov 21, 2025

fengzch-das (@Fzc7075) mentioned in commit 7c282e2e9f2302b856ced94c34b867fb45ad291d · Nov 21, 2025