Commit 1a8114bf authored by fengzch-das

hipify code

parent c0177256
Pipeline #3049 canceled with stages
@@ -3,7 +3,7 @@
 #include "dispatch_cutlass.h"
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #include "cutlass/cutlass.h"
 #include "cutlass/conv/device/direct_convolution.h"
@@ -74,7 +74,7 @@ static cutlass::Status depthwise_conv2d_kernel_run(cutlass::conv::Conv2dProblemS
 UnderlyingKernel::ElementA *A, UnderlyingKernel::ElementB *B,
 UnderlyingKernel::ElementC *C, UnderlyingKernel::ElementC *D,
 ElementCompute alpha, ElementCompute beta, std::string split_k_mode,
-cudaStream_t stream, int device_id = 0)
+hipStream_t stream, int device_id = 0)
 {
 // create the tensor references
 cutlass::Tensor4DCoord tensor_coord_A = cutlass::conv::implicit_gemm_tensor_a_extent(
@@ -183,7 +183,7 @@ Tensor depthwise_conv2d_kernel(Tensor A, Tensor B) {
 Tensor D = Tensor::allocate({N, P, Q, K}, A.dtype(), A.device());
-auto stream = getCurrentCUDAStream();
+auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
 cutlass::Status status = depthwise_conv2d_kernel_run(
 &problem_size,
@@ -319,7 +319,7 @@ Tensor dwconv_f16(Tensor input, Tensor weight, Tensor out, Tensor bias) {
 size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
 BufferCUDA workspace(workspace_size);
-auto stream = getCurrentCUDAStream();
+auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
 cutlass::Status status = implicit_gemm_op.can_implement(arguments);
 if (status != cutlass::Status::kSuccess) {
......
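Note on the hunks above: the hipify pass maps the CUDA runtime header and stream types onto their HIP counterparts (<cuda_runtime.h> -> <hip/hip_runtime.h>, cudaStream_t -> hipStream_t) and routes stream lookup through getCurrentHIPStreamMasqueradingAsCUDA(). A minimal, self-contained sketch of the same stream mapping using only public HIP runtime calls; stream_roundtrip is illustrative and not part of this repository:

#include <hip/hip_runtime.h>   // hipified from <cuda_runtime.h>
#include <cstdio>

// Create a stream, enqueue an async host-to-device copy, and wait for it.
static void stream_roundtrip(const float *host_src, float *device_dst, size_t n) {
    hipStream_t stream;                                   // was cudaStream_t
    if (hipStreamCreate(&stream) != hipSuccess) return;   // was cudaStreamCreate
    hipMemcpyAsync(device_dst, host_src, n * sizeof(float),
                   hipMemcpyHostToDevice, stream);        // was cudaMemcpyAsync
    hipStreamSynchronize(stream);                         // was cudaStreamSynchronize
    hipStreamDestroy(stream);                             // was cudaStreamDestroy
    std::printf("copied %zu floats\n", n);
}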
@@ -2,7 +2,7 @@
 #include "common.h"
 #include "Tensor.h"
-#include <cuda_fp16.h>
+#include <hip/hip_fp16.h>
 void rms_norm(Tensor &out, // [num_tokens, hidden_size]
 Tensor &input, // [num_tokens, hidden_size]
......
#include "hip/hip_runtime.h"
#include "layernorm_kernels_impl.cuh" #include "layernorm_kernels_impl.cuh"
#include "dispatch_utils.h" #include "dispatch_utils.h"
...@@ -10,17 +11,17 @@ void rms_norm(Tensor &out, // [..., hidden_size] ...@@ -10,17 +11,17 @@ void rms_norm(Tensor &out, // [..., hidden_size]
int num_tokens = input.numel() / hidden_size; int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
if (use_quant) { if (use_quant) {
vllm::rms_norm_kernel<scalar_t, int8_t, true><<<grid, block, 0, stream>>>(out.data_ptr<int8_t>(), hipLaunchKernelGGL(( vllm::rms_norm_kernel<scalar_t, int8_t, true>), dim3(grid), dim3(block), 0, stream, out.data_ptr<int8_t>(),
input.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
epsilon, epsilon,
num_tokens, num_tokens,
hidden_size); hidden_size);
} else { } else {
vllm::rms_norm_kernel<scalar_t, scalar_t, false><<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), hipLaunchKernelGGL(( vllm::rms_norm_kernel<scalar_t, scalar_t, false>), dim3(grid), dim3(block), 0, stream, out.data_ptr<scalar_t>(),
input.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
epsilon, epsilon,
...@@ -39,10 +40,10 @@ void layernorm_general(Tensor out, Tensor input, Tensor weight, Tensor bias, flo ...@@ -39,10 +40,10 @@ void layernorm_general(Tensor out, Tensor input, Tensor weight, Tensor bias, flo
size_t size_shmem = input.scalar_size() * hidden_size; size_t size_shmem = input.scalar_size() * hidden_size;
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
using T = typename packed_as<scalar_t, 2>::type; using T = typename packed_as<scalar_t, 2>::type;
vllm::generalLayerNorm<T, half, true><<<grid, block, size_shmem, stream>>>( hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half, true>), dim3(grid), dim3(block), size_shmem, stream,
reinterpret_cast<T *>(input.data_ptr<scalar_t>()), reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
weight.valid() ? reinterpret_cast<T *>(weight.data_ptr<scalar_t>()) : nullptr, weight.valid() ? reinterpret_cast<T *>(weight.data_ptr<scalar_t>()) : nullptr,
bias.valid() ? reinterpret_cast<T *>(bias.data_ptr<scalar_t>()) : nullptr, bias.valid() ? reinterpret_cast<T *>(bias.data_ptr<scalar_t>()) : nullptr,
...@@ -69,13 +70,13 @@ void rms_norm_general(Tensor &out, // [..., hidden_size] ...@@ -69,13 +70,13 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
block.x = 32 * ((block.x + 31) / 32); block.x = 32 * ((block.x + 31) / 32);
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
using T = scalar_t; using T = scalar_t;
if (use_per_token_quant) { if (use_per_token_quant) {
// per-token // per-token
vllm::generalLayerNorm<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -92,8 +93,8 @@ void rms_norm_general(Tensor &out, // [..., hidden_size] ...@@ -92,8 +93,8 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
// weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size); // weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
} else { } else {
// per-tensor // per-tensor
vllm::generalLayerNorm<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -121,13 +122,13 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size] ...@@ -121,13 +122,13 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
block.x = 32 * ((block.x + 31) / 32); block.x = 32 * ((block.x + 31) / 32);
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm_fuse_sum", [&] { VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm_fuse_sum", [&] {
using T = scalar_t; using T = scalar_t;
if (use_per_token_quant) { if (use_per_token_quant) {
// per-token // per-token
vllm::generalLayerNorm_fuse_sum<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm_fuse_sum<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -149,8 +150,8 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size] ...@@ -149,8 +150,8 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
// Not implemented per-tensor input_sum // Not implemented per-tensor input_sum
assert(false); assert(false);
vllm::generalLayerNorm_fuse_sum<T, half> hipLaunchKernelGGL(( vllm::generalLayerNorm_fuse_sum<T, half>)
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()), , dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()), reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr, nullptr,
nullptr, nullptr,
...@@ -176,10 +177,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde ...@@ -176,10 +177,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
int num_tokens = input.numel() / hidden_size; int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] { VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false> hipLaunchKernelGGL(( vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false>)
<<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(), , dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
residual.data_ptr<scalar_t>(), residual.data_ptr<scalar_t>(),
out.data_ptr<int8_t>(), out.data_ptr<int8_t>(),
gamma.data_ptr<scalar_t>(), gamma.data_ptr<scalar_t>(),
...@@ -202,10 +203,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde ...@@ -202,10 +203,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
const cudaStream_t stream = getCurrentCUDAStream(); const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] { VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true> hipLaunchKernelGGL(( vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true>)
<<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(), , dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
residual.data_ptr<scalar_t>(), residual.data_ptr<scalar_t>(),
out.data_ptr<int8_t>(), out.data_ptr<int8_t>(),
gamma.data_ptr<scalar_t>(), gamma.data_ptr<scalar_t>(),
......
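Note on the hunks above: every triple-chevron launch is rewritten to hipLaunchKernelGGL(kernel, grid, block, sharedMemBytes, stream, args...); the extra parentheses around templated kernel names are how hipify keeps commas inside template arguments out of the macro's argument list. A hedged, self-contained sketch of the equivalence (scale_kernel is a made-up example, not from this repository):

#include <hip/hip_runtime.h>

__global__ void scale_kernel(float *x, float a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}

void launch_scale(float *x, float a, int n, hipStream_t stream) {
    dim3 block(256);
    dim3 grid((n + block.x - 1) / block.x);
    // CUDA form:      scale_kernel<<<grid, block, 0, stream>>>(x, a, n);
    // hipified form (what the hunks above emit):
    hipLaunchKernelGGL(scale_kernel, grid, block, 0, stream, x, a, n);
}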
-#include <cuda_bf16.h>
+#include "hip/hip_runtime.h"
+#include <hip/hip_bf16.h>
 #define ENABLE_BF16 1
......
#include "hip/hip_runtime.h"
#include "misc_kernels_impl.cuh" #include "misc_kernels_impl.cuh"
#include "misc_kernels.h" #include "misc_kernels.h"
#include "dispatch_utils.h" #include "dispatch_utils.h"
...@@ -13,12 +14,12 @@ Tensor add(Tensor a, Tensor b) { ...@@ -13,12 +14,12 @@ Tensor add(Tensor a, Tensor b) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (a.numel() + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (a.numel() + threadsPerBlock - 1) / threadsPerBlock;
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
Tensor out = Tensor::empty_like(a); Tensor out = Tensor::empty_like(a);
dispatch(out.scalar_type(), [&]<typename scalar_t>() { dispatch(out.scalar_type(), [&]<typename scalar_t>() {
add_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( add_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
a.data_ptr<scalar_t>(), b.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(), out.numel()); a.data_ptr<scalar_t>(), b.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(), out.numel());
}); });
...@@ -46,12 +47,12 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) { ...@@ -46,12 +47,12 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll); int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
if (scale.valid()) { if (scale.valid()) {
mul_add_kernel<scalar_t, unroll, false> hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, false>)
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(), , dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
scale.data_ptr<scalar_t>(), scale.data_ptr<scalar_t>(),
bias.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
0, 0,
...@@ -62,7 +63,7 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) { ...@@ -62,7 +63,7 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
0, 0,
0); 0);
} else { } else {
mul_add_kernel<scalar_t, unroll, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, true>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
x.data_ptr<scalar_t>(), nullptr, bias.data_ptr<scalar_t>(), 0, x.numel(), 1, bias.numel(), 0, 0, 0); x.data_ptr<scalar_t>(), nullptr, bias.data_ptr<scalar_t>(), 0, x.numel(), 1, bias.numel(), 0, 0, 0);
} }
}); });
...@@ -96,12 +97,12 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift, ...@@ -96,12 +97,12 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
dim3 grid(ceilDiv(numel, threadsPerBlock * unroll), batch_size); dim3 grid(ceilDiv(numel, threadsPerBlock * unroll), batch_size);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
if (scale.valid()) { if (scale.valid()) {
mul_add_kernel<scalar_t, unroll, false> hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, false>)
<<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(), , dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
scale.data_ptr<scalar_t>(), scale.data_ptr<scalar_t>(),
bias.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
(scalar_t)scale_shift, (scalar_t)scale_shift,
...@@ -112,8 +113,8 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift, ...@@ -112,8 +113,8 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
batch_scale ? scale.stride(0) : 0, batch_scale ? scale.stride(0) : 0,
batch_bias ? bias.stride(0) : 0); batch_bias ? bias.stride(0) : 0);
} else { } else {
mul_add_kernel<scalar_t, unroll, true> hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, true>)
<<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(), , dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
nullptr, nullptr,
bias.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(),
(scalar_t)scale_shift, (scalar_t)scale_shift,
...@@ -134,12 +135,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) { ...@@ -134,12 +135,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
auto shapeOut = input_id.shape; auto shapeOut = input_id.shape;
shapeOut.dataExtent.push_back(lookup.shape[-1]); shapeOut.dataExtent.push_back(lookup.shape[-1]);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
Tensor out = Tensor::empty(shapeOut, lookup.scalar_type(), input_id.device()); Tensor out = Tensor::empty(shapeOut, lookup.scalar_type(), input_id.device());
dispatch(out.scalar_type(), [&]<typename scalar_t>() { dispatch(out.scalar_type(), [&]<typename scalar_t>() {
EmbeddingKernel<<<input_id.numel(), std::min(lookup.shape[-1], 1024), 0, stream>>>( hipLaunchKernelGGL(( EmbeddingKernel), dim3(input_id.numel()), dim3(std::min(lookup.shape[-1], 1024)), 0, stream,
input_id.data_ptr<int32_t>(), out.data_ptr<scalar_t>(), lookup.data_ptr<scalar_t>(), lookup.shape[-1]); input_id.data_ptr<int32_t>(), out.data_ptr<scalar_t>(), lookup.data_ptr<scalar_t>(), lookup.shape[-1]);
}); });
...@@ -149,12 +150,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) { ...@@ -149,12 +150,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
Tensor argmax_sample(Tensor logits) { Tensor argmax_sample(Tensor logits) {
assert(logits.ndims() == 2); assert(logits.ndims() == 2);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
Tensor out = Tensor::empty({logits.shape[0]}, Tensor::INT32, logits.device()); Tensor out = Tensor::empty({logits.shape[0]}, Tensor::INT32, logits.device());
dispatch(logits.scalar_type(), [&]<typename scalar_t>() { dispatch(logits.scalar_type(), [&]<typename scalar_t>() {
argmax_sample_kernel<<<logits.shape[0], std::min(logits.shape[1], 1024), 0, stream>>>( hipLaunchKernelGGL(( argmax_sample_kernel), dim3(logits.shape[0]), dim3(std::min(logits.shape[1], 1024)), 0, stream,
logits.data_ptr<scalar_t>(), out.data_ptr<int32_t>(), logits.shape[1]); logits.data_ptr<scalar_t>(), out.data_ptr<int32_t>(), logits.shape[1]);
}); });
...@@ -167,7 +168,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) { ...@@ -167,7 +168,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
// assert(qkv.shape[0] == k.shape[0]); // assert(qkv.shape[0] == k.shape[0]);
// assert(qkv.shape[0] == v.shape[0]); // assert(qkv.shape[0] == v.shape[0]);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
int dim_q = q.shape[-1] * q.shape[-2]; int dim_q = q.shape[-1] * q.shape[-2];
int dim_k = k.shape[-1] * k.shape[-2]; int dim_k = k.shape[-1] * k.shape[-2];
...@@ -179,7 +180,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) { ...@@ -179,7 +180,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
int num_tokens = qkv.numel() / qkv.shape[-1]; int num_tokens = qkv.numel() / qkv.shape[-1];
dispatch(qkv.scalar_type(), [&]<typename scalar_t>() { dispatch(qkv.scalar_type(), [&]<typename scalar_t>() {
splitqkv_kernel<<<num_tokens, std::min(qkv.shape[-1], 1024), 0, stream>>>(qkv.data_ptr<scalar_t>(), hipLaunchKernelGGL(( splitqkv_kernel), dim3(num_tokens), dim3(std::min(qkv.shape[-1], 1024)), 0, stream, qkv.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), q.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(), k.data_ptr<scalar_t>(),
v.data_ptr<scalar_t>(), v.data_ptr<scalar_t>(),
...@@ -195,7 +196,7 @@ std::array<Tensor, N> split_mod(Tensor input) { ...@@ -195,7 +196,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (input.numel() + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (input.numel() + threadsPerBlock - 1) / threadsPerBlock;
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto shapeOut = TensorShape(input.shape.dataExtent); auto shapeOut = TensorShape(input.shape.dataExtent);
shapeOut[-1] /= N; shapeOut[-1] /= N;
...@@ -210,7 +211,7 @@ std::array<Tensor, N> split_mod(Tensor input) { ...@@ -210,7 +211,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
for (int k = 0; k < N; k++) { for (int k = 0; k < N; k++) {
outPtr[k] = out[k].template data_ptr<scalar_t>(); outPtr[k] = out[k].template data_ptr<scalar_t>();
} }
split_mod_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( split_mod_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
input.data_ptr<scalar_t>(), outPtr, input.numel()); input.data_ptr<scalar_t>(), outPtr, input.numel());
}); });
...@@ -227,10 +228,10 @@ Tensor quant_static(Tensor x, float scale) { ...@@ -227,10 +228,10 @@ Tensor quant_static(Tensor x, float scale) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll); int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
quant_kernel_static<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( quant_kernel_static<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel()); x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
}); });
...@@ -247,10 +248,10 @@ Tensor quant_static_fuse_gelu(Tensor x, float scale) { ...@@ -247,10 +248,10 @@ Tensor quant_static_fuse_gelu(Tensor x, float scale) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll); int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
quant_kernel_static_fuse_gelu<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( quant_kernel_static_fuse_gelu<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel()); x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
}); });
...@@ -266,7 +267,7 @@ void cast(Tensor input, Tensor output) { ...@@ -266,7 +267,7 @@ void cast(Tensor input, Tensor output) {
assert(input.scalar_size() == output.scalar_size()); assert(input.scalar_size() == output.scalar_size());
} }
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatch(input.scalar_type(), [&]<typename input_t>() { dispatch(input.scalar_type(), [&]<typename input_t>() {
dispatch(output.scalar_type(), [&]<typename output_t>() { dispatch(output.scalar_type(), [&]<typename output_t>() {
...@@ -275,10 +276,10 @@ void cast(Tensor input, Tensor output) { ...@@ -275,10 +276,10 @@ void cast(Tensor input, Tensor output) {
int threadsPerBlock = 1024; int threadsPerBlock = 1024;
int blocksPerGrid = (int)ceilDiv<int64_t>(input.numel(), threadsPerBlock * unroll); int blocksPerGrid = (int)ceilDiv<int64_t>(input.numel(), threadsPerBlock * unroll);
cast_kernel<input_t, output_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>( hipLaunchKernelGGL(( cast_kernel<input_t, output_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
input.data_ptr<input_t>(), output.data_ptr<output_t>(), input.numel()); input.data_ptr<input_t>(), output.data_ptr<output_t>(), input.numel());
checkCUDA(cudaGetLastError()); checkCUDA(hipGetLastError());
}); });
}); });
} }
...@@ -298,7 +299,7 @@ Tensor topk(Tensor x, int k) { ...@@ -298,7 +299,7 @@ Tensor topk(Tensor x, int k) {
Tensor out = Tensor::empty(outShape, Tensor::INT32, x.device()); Tensor out = Tensor::empty(outShape, Tensor::INT32, x.device());
auto stream = getCurrentCUDAStream(); auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
dispatchVal(k, std::make_integer_sequence<int, MAXK + 1>(), [&]<int K>() { dispatchVal(k, std::make_integer_sequence<int, MAXK + 1>(), [&]<int K>() {
if constexpr (K == 0) { if constexpr (K == 0) {
...@@ -307,9 +308,9 @@ Tensor topk(Tensor x, int k) { ...@@ -307,9 +308,9 @@ Tensor topk(Tensor x, int k) {
} }
if constexpr (K > 0) { if constexpr (K > 0) {
dispatch(x.scalar_type(), [&]<typename scalar_t>() { dispatch(x.scalar_type(), [&]<typename scalar_t>() {
topk_kernel<scalar_t, K><<<ceilDiv(batch, 32), 32, 0, stream>>>( hipLaunchKernelGGL(( topk_kernel<scalar_t, K>), dim3(ceilDiv(batch, 32)), dim3(32), 0, stream,
x.data_ptr<scalar_t>(), out.data_ptr<int>(), N, x.stride(-2), batch); x.data_ptr<scalar_t>(), out.data_ptr<int>(), N, x.stride(-2), batch);
checkCUDA(cudaGetLastError()); checkCUDA(hipGetLastError());
}); });
} }
}); });
......
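Note on the hunks above: besides the launch rewrites, cudaGetLastError() becomes hipGetLastError(). The repository's checkCUDA macro is assumed to wrap the usual error-to-string-and-abort pattern, roughly like the sketch below (CHECK_HIP is a hypothetical stand-in, not the project's macro):

#include <cstdio>
#include <cstdlib>
#include <hip/hip_runtime.h>

// Abort with a readable message when a HIP runtime call fails.
#define CHECK_HIP(expr)                                                     \
    do {                                                                    \
        hipError_t err_ = (expr);                                           \
        if (err_ != hipSuccess) {                                           \
            std::fprintf(stderr, "HIP error %s at %s:%d\n",                 \
                         hipGetErrorString(err_), __FILE__, __LINE__);      \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Typical use right after a kernel launch, mirroring the hunks above:
//   CHECK_HIP(hipGetLastError());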
#include "hip/hip_runtime.h"
#include "reduction_utils.cuh" #include "reduction_utils.cuh"
#include <array> #include <array>
#include <cuda_fp16.h> #include <hip/hip_fp16.h>
#include <cuda_bf16.h> #include <hip/hip_bf16.h>
#include "utils.cuh" #include "utils.cuh"
#include "activation_kernels_impl.cuh" #include "activation_kernels_impl.cuh"
......
#include "hip/hip_runtime.h"
/* /*
* Adapted from * Adapted from
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
......
#include "hip/hip_runtime.h"
// Adated from FasterTransformer, // Adated from FasterTransformer,
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
#pragma once #pragma once
...@@ -9,10 +10,10 @@ ...@@ -9,10 +10,10 @@
#include <cstdio> #include <cstdio>
#include <cuda_fp16.h> #include <hip/hip_fp16.h>
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
#include <cuda_bf16.h> #include <hip/hip_bf16.h>
#endif #endif
__device__ __forceinline__ static void trap_unsupported_arch() { __device__ __forceinline__ static void trap_unsupported_arch() {
...@@ -24,11 +25,11 @@ __device__ __forceinline__ static void trap_unsupported_arch() { ...@@ -24,11 +25,11 @@ __device__ __forceinline__ static void trap_unsupported_arch() {
__trap(); __trap();
} }
#if defined(ENABLE_BF16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(ENABLE_BF16) && defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
__device__ __forceinline__ static __nv_bfloat162 __device__ __forceinline__ static __hip_bfloat162
__hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) { __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c) {
trap_unsupported_arch(); trap_unsupported_arch();
return __nv_bfloat162(0.0f, 0.0f); return __hip_bfloat162(0.0f, 0.0f);
} }
#endif #endif
...@@ -56,11 +57,11 @@ struct num_elems<half2> { ...@@ -56,11 +57,11 @@ struct num_elems<half2> {
}; };
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
struct num_elems<__nv_bfloat16> { struct num_elems<__hip_bfloat16> {
static constexpr int value = 1; static constexpr int value = 1;
}; };
template<> template<>
struct num_elems<__nv_bfloat162> { struct num_elems<__hip_bfloat162> {
static constexpr int value = 2; static constexpr int value = 2;
}; };
#endif #endif
...@@ -107,12 +108,12 @@ struct packed_as<float2, 1> { ...@@ -107,12 +108,12 @@ struct packed_as<float2, 1> {
}; };
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
struct packed_as<__nv_bfloat16, 2> { struct packed_as<__hip_bfloat16, 2> {
using type = __nv_bfloat162; using type = __hip_bfloat162;
}; };
template<> template<>
struct packed_as<__nv_bfloat162, 1> { struct packed_as<__hip_bfloat162, 1> {
using type = __nv_bfloat16; using type = __hip_bfloat16;
}; };
#endif #endif
#ifdef ENABLE_FP8 #ifdef ENABLE_FP8
...@@ -169,8 +170,8 @@ inline __device__ T ldg(const T *val) { ...@@ -169,8 +170,8 @@ inline __device__ T ldg(const T *val) {
#define bf1622float2 __bfloat1622float2 #define bf1622float2 __bfloat1622float2
#define float22bf162 __float22bfloat162_rn #define float22bf162 __float22bfloat162_rn
#define bf162bf162 __bfloat162bfloat162 #define bf162bf162 __bfloat162bfloat162
inline __device__ int16_t bf1622int16(__nv_bfloat162 val) { inline __device__ int16_t bf1622int16(__hip_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
float2 f_val; float2 f_val;
f_val.x = max(min(__low2float(val), 127.f), -128.f); f_val.x = max(min(__low2float(val), 127.f), -128.f);
f_val.y = max(min(__high2float(val), 127.f), -128.f); f_val.y = max(min(__high2float(val), 127.f), -128.f);
...@@ -201,8 +202,8 @@ inline __device__ int16_t bf1622int16(__nv_bfloat162 val) { ...@@ -201,8 +202,8 @@ inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
#if ENABLE_BF16 #if ENABLE_BF16
template<> template<>
inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) { inline __device__ __hip_bfloat162 ldg(const __hip_bfloat162 *val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
return val[0]; return val[0];
#else #else
return __ldg(val); return __ldg(val);
...@@ -210,8 +211,8 @@ inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) { ...@@ -210,8 +211,8 @@ inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) {
} }
template<> template<>
inline __device__ __nv_bfloat16 ldg(const __nv_bfloat16 *val) { inline __device__ __hip_bfloat16 ldg(const __hip_bfloat16 *val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
return val[0]; return val[0];
#else #else
return __ldg(val); return __ldg(val);
...@@ -330,81 +331,81 @@ __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) { ...@@ -330,81 +331,81 @@ __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast(int32_t val) { __device__ inline __hip_bfloat16 cuda_cast(int32_t val) {
return static_cast<float>(val); return static_cast<float>(val);
} }
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast(int8_t val) { __device__ inline __hip_bfloat16 cuda_cast(int8_t val) {
return static_cast<float>(val); return static_cast<float>(val);
} }
template<> template<>
__device__ inline int8_t cuda_cast(__nv_bfloat16 val) { __device__ inline int8_t cuda_cast(__hip_bfloat16 val) {
return static_cast<float>(val); return static_cast<float>(val);
} }
template<> template<>
__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) { __device__ inline float cuda_cast<float, __hip_bfloat16>(__hip_bfloat16 val) {
return __bfloat162float(val); return __bfloat162float(val);
} }
template<> template<>
__device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) { __device__ inline float2 cuda_cast<float2, __hip_bfloat162>(__hip_bfloat162 val) {
return bf1622float2(val); return bf1622float2(val);
} }
template<> template<>
__device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val) { __device__ inline half cuda_cast<half, __hip_bfloat16>(__hip_bfloat16 val) {
return __float2half(__bfloat162float(val)); return __float2half(__bfloat162float(val));
} }
template<> template<>
__device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) { __device__ inline int16_t cuda_cast<int16_t, __hip_bfloat162>(__hip_bfloat162 val) {
return bf1622int16(val); return bf1622int16(val);
} }
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) { __device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, float>(float val) {
return __float2bfloat16(val); return __float2bfloat16(val);
} }
template<> template<>
__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val) { __device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, half>(half val) {
return __float2bfloat16(__half2float(val)); return __float2bfloat16(__half2float(val));
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, __hip_bfloat16>(__hip_bfloat16 val) {
return bf162bf162(val); return bf162bf162(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float>(float val) {
return __float2bfloat162_rn(val); return __float2bfloat162_rn(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float2>(float2 val) {
return float22bf162(val); return float22bf162(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, int16_t>(int16_t val) {
union { union {
int8_t int8[2]; int8_t int8[2];
int16_t int16; int16_t int16;
}; };
int16 = val; int16 = val;
__nv_bfloat162 res; __hip_bfloat162 res;
res.x = cuda_cast<__nv_bfloat16>(int8[0]); res.x = cuda_cast<__hip_bfloat16>(int8[0]);
res.y = cuda_cast<__nv_bfloat16>(int8[1]); res.y = cuda_cast<__hip_bfloat16>(int8[1]);
return res; return res;
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) { __device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, half2>(half2 val) {
return float22bf162(__half22float2(val)); return float22bf162(__half22float2(val));
} }
...@@ -420,7 +421,7 @@ __device__ __forceinline__ packed_as<half, 2>::type f162f162<half>(half x) { ...@@ -420,7 +421,7 @@ __device__ __forceinline__ packed_as<half, 2>::type f162f162<half>(half x) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
__device__ __forceinline__ packed_as<__nv_bfloat16, 2>::type f162f162<__nv_bfloat16>(__nv_bfloat16 x) { __device__ __forceinline__ packed_as<__hip_bfloat16, 2>::type f162f162<__hip_bfloat16>(__hip_bfloat16 x) {
return __bfloat162bfloat162(x); return __bfloat162bfloat162(x);
} }
#endif #endif
...@@ -453,8 +454,8 @@ __device__ inline half cuda_max(half2 val) { ...@@ -453,8 +454,8 @@ __device__ inline half cuda_max(half2 val) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
template<> template<>
__device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val) { __device__ inline __hip_bfloat16 cuda_max(__hip_bfloat162 val) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) #if (defined(__DTK_ARCH__) && (__DTK_ARCH__ >= 800))
return __hmax(val.x, val.y); return __hmax(val.x, val.y);
#else #else
assert(false); assert(false);
...@@ -497,14 +498,14 @@ __device__ inline half2 cuda_abs(half2 val) { ...@@ -497,14 +498,14 @@ __device__ inline half2 cuda_abs(half2 val) {
#ifdef ENABLE_BF16 #ifdef ENABLE_BF16
#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) #if __DTK_ARCH__ >= 800 || !defined(__DTK_ARCH__)
template<> template<>
__device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { __device__ inline __hip_bfloat16 cuda_abs(__hip_bfloat16 val) {
return __habs(val); return __habs(val);
} }
template<> template<>
__device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { __device__ inline __hip_bfloat162 cuda_abs(__hip_bfloat162 val) {
return __habs2(val); return __habs2(val);
} }
#endif #endif
......
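Note on the hunks above: the bf16 vector types are renamed (__nv_bfloat16/__nv_bfloat162 -> __hip_bfloat16/__hip_bfloat162) and architecture guards switch from __CUDA_ARCH__ to __DTK_ARCH__, while the conversion intrinsics (__bfloat162float, __bfloat1622float2, __float22bfloat162_rn) keep their names. A hedged sketch of the resulting device-side usage, assuming <hip/hip_bf16.h> provides those intrinsics as the diff relies on:

#include <hip/hip_runtime.h>
#include <hip/hip_bf16.h>

// Sum the two bf16 lanes of a packed value by converting through float,
// the same route the cuda_cast/bf1622int16 helpers above take.
__device__ __forceinline__ static float bf162_lane_sum(__hip_bfloat162 v) {
    float2 f = __bfloat1622float2(v);   // packed bf16x2 -> float2
    return f.x + f.y;
}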
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "gemm_base.cuh" #include "gemm_base.cuh"
...@@ -26,8 +27,8 @@ struct AttentionFP16Config { ...@@ -26,8 +27,8 @@ struct AttentionFP16Config {
using half_t = half; using half_t = half;
using half2_t = half2; using half2_t = half2;
using epilogue_half_t = typename std::conditional_t<bf16out, __nv_bfloat16, half>; using epilogue_half_t = typename std::conditional_t<bf16out, __hip_bfloat16, half>;
using epilogue_half2_t = typename std::conditional_t<bf16out, __nv_bfloat162, half2>; using epilogue_half2_t = typename std::conditional_t<bf16out, __hip_bfloat162, half2>;
}; };
using AttentionFP16Config_FP16 = AttentionFP16Config<false>; using AttentionFP16Config_FP16 = AttentionFP16Config<false>;
...@@ -60,7 +61,7 @@ public: ...@@ -60,7 +61,7 @@ public:
using typename AttentionConfig::epilogue_half_t; using typename AttentionConfig::epilogue_half_t;
using typename AttentionConfig::epilogue_half2_t; using typename AttentionConfig::epilogue_half2_t;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
static constexpr bool IS_SM80 = true; static constexpr bool IS_SM80 = true;
#else #else
static constexpr bool IS_SM80 = false; static constexpr bool IS_SM80 = false;
...@@ -657,7 +658,7 @@ public: ...@@ -657,7 +658,7 @@ public:
template<typename Epilogue> template<typename Epilogue>
struct attention_fp16_kernel { struct attention_fp16_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int SHMEM_SIZE = 0; // sizeof(q_shmem_t); static constexpr int SHMEM_SIZE = 0; // sizeof(q_shmem_t);
__device__ void operator()(const packed_q_t *ptr_q, __device__ void operator()(const packed_q_t *ptr_q,
......
#include "hip/hip_runtime.h"
#include "zgemm.h" #include "zgemm.h"
#include "attention.cuh" #include "attention.cuh"
...@@ -71,10 +72,10 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM] ...@@ -71,10 +72,10 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE); shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE);
if (shmem >= 24 * 1024) { if (shmem >= 24 * 1024) {
checkCUDA(cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); checkCUDA(hipFuncSetAttribute(func, hipFuncAttributeMaxDynamicSharedMemorySize, shmem));
} }
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(q.data_ptr<packed_q_t>(), hipLaunchKernelGGL(( func), dim3(grid), dim3(GEMM::WARP_SIZE * GEMM::NUM_WARPS), shmem, getCurrentHIPStreamMasqueradingAsCUDA(), q.data_ptr<packed_q_t>(),
k.data_ptr<packed_k_t>(), k.data_ptr<packed_k_t>(),
v.data_ptr<packed_v_t>(), v.data_ptr<packed_v_t>(),
scale, scale,
...@@ -82,7 +83,7 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM] ...@@ -82,7 +83,7 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
numTokensKV, numTokensKV,
args, args,
false); false);
checkCUDA(cudaGetLastError()); checkCUDA(hipGetLastError());
}; };
launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{ launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{
......
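Note on the hunk above: cudaFuncSetAttribute/cudaFuncAttributeMaxDynamicSharedMemorySize become hipFuncSetAttribute/hipFuncAttributeMaxDynamicSharedMemorySize before a launch that needs a large dynamic shared-memory carveout. A self-contained sketch of that sequence (big_smem_kernel and launch_big_smem are illustrative, not from this repository):

#include <hip/hip_runtime.h>

__global__ void big_smem_kernel(float *out) {
    extern __shared__ float smem[];   // dynamic shared memory
    smem[threadIdx.x] = static_cast<float>(threadIdx.x);
    __syncthreads();
    out[threadIdx.x] = smem[threadIdx.x];
}

void launch_big_smem(float *out, hipStream_t stream) {
    size_t shmem = 32 * 1024;  // above the 24 KiB threshold used in the hunk above
    hipFuncSetAttribute(reinterpret_cast<const void *>(big_smem_kernel),
                        hipFuncAttributeMaxDynamicSharedMemorySize,
                        static_cast<int>(shmem));
    hipLaunchKernelGGL(big_smem_kernel, dim3(1), dim3(256), shmem, stream, out);
}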
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "gemm_base.cuh" #include "gemm_base.cuh"
...@@ -702,7 +703,7 @@ public: ...@@ -702,7 +703,7 @@ public:
// q: [batch_size, #blocks, block_size, #heads, HEAD_DIM] // q: [batch_size, #blocks, block_size, #heads, HEAD_DIM]
// vk: [batch_size, #heads, HEAD_DIM+1, HEAD_DIM] // vk: [batch_size, #heads, HEAD_DIM+1, HEAD_DIM]
struct vk_mul_q_kernel { struct vk_mul_q_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
// FIXME FIXME FIXME // FIXME FIXME FIXME
__device__ void operator()(half_t *q, const float *vk, float eps, int num_tokens) { __device__ void operator()(half_t *q, const float *vk, float eps, int num_tokens) {
const int block_id = blockIdx.x; const int block_id = blockIdx.x;
...@@ -762,7 +763,7 @@ public: ...@@ -762,7 +763,7 @@ public:
template<typename Epilogue> template<typename Epilogue>
struct test_epilogue_kernel { struct test_epilogue_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr size_t SHMEM_PER_WARP = static constexpr size_t SHMEM_PER_WARP =
ceilDiv<size_t>(Base::template load_act_to_fpsum<false>::SHMEM_SIZE, 128) * 128; ceilDiv<size_t>(Base::template load_act_to_fpsum<false>::SHMEM_SIZE, 128) * 128;
static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS; static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
......
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "common.h" #include "common.h"
...@@ -44,8 +45,8 @@ public: ...@@ -44,8 +45,8 @@ public:
// may generate incorrect results in certain circumstances // may generate incorrect results in certain circumstances
static constexpr bool FASTER_I2F = faster_i2f; static constexpr bool FASTER_I2F = faster_i2f;
using half_t = typename std::conditional_t<bf16, __nv_bfloat16, half>; using half_t = typename std::conditional_t<bf16, __hip_bfloat16, half>;
using half2_t = typename std::conditional_t<bf16, __nv_bfloat162, half2>; using half2_t = typename std::conditional_t<bf16, __hip_bfloat162, half2>;
}; };
using GEMMConfig_W4A4_FP16 = GEMMConfig_W4A4<false>; using GEMMConfig_W4A4_FP16 = GEMMConfig_W4A4<false>;
...@@ -67,8 +68,8 @@ public: ...@@ -67,8 +68,8 @@ public:
using half_t = half; using half_t = half;
using half2_t = half2; using half2_t = half2;
#else #else
using half_t = __nv_bfloat16; using half_t = __hip_bfloat16;
using half2_t = __nv_bfloat162; using half2_t = __hip_bfloat162;
#endif #endif
}; };
...@@ -202,9 +203,9 @@ public: ...@@ -202,9 +203,9 @@ public:
__device__ __forceinline__ static packed_f32psum_t __device__ __forceinline__ static packed_f32psum_t
mma_f16xf16_f32(packed_fpsum_t a, packed_fpsum_t b, packed_f32psum_t psum) { mma_f16xf16_f32(packed_fpsum_t a, packed_fpsum_t b, packed_f32psum_t psum) {
static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __nv_bfloat16>); static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __hip_bfloat16>);
static constexpr bool is_bf16 = std::is_same_v<half_t, __nv_bfloat16>; static constexpr bool is_bf16 = std::is_same_v<half_t, __hip_bfloat16>;
uint4 out1 = mma_m16n8k16_f32f16f16f32<is_bf16>( uint4 out1 = mma_m16n8k16_f32f16f16f32<is_bf16>(
kernels::bit_cast<uint4>(a), kernels::bit_cast<uint4>(a),
...@@ -890,8 +891,8 @@ constexpr int max_arch() { ...@@ -890,8 +891,8 @@ constexpr int max_arch() {
template<typename kernel, typename... T> template<typename kernel, typename... T>
__global__ static void invoke_kernel(T... args) { __global__ static void invoke_kernel(T... args) {
#ifdef __CUDA_ARCH__ #ifdef __DTK_ARCH__
if constexpr (__CUDA_ARCH__ >= min_arch<kernel>() && __CUDA_ARCH__ <= max_arch<kernel>()) { if constexpr (__DTK_ARCH__ >= min_arch<kernel>() && __DTK_ARCH__ <= max_arch<kernel>()) {
kernel()(args...); kernel()(args...);
} else { } else {
trap_unsupported_arch(); trap_unsupported_arch();
...@@ -916,8 +917,8 @@ template<typename T> ...@@ -916,8 +917,8 @@ template<typename T>
static void test_sizeof() { static void test_sizeof() {
printf("typeid = %s\n", typeid(T).name()); printf("typeid = %s\n", typeid(T).name());
test_sizeof_host<T>(); test_sizeof_host<T>();
test_sizeof_device<T><<<1, 1>>>(); hipLaunchKernelGGL(( test_sizeof_device<T>), dim3(1), dim3(1), 0, 0, );
checkCUDA(cudaDeviceSynchronize()); checkCUDA(hipDeviceSynchronize());
} }
}; // namespace nunchaku::kernels }; // namespace nunchaku::kernels
...@@ -163,7 +163,7 @@ __device__ __forceinline__ static float2 half22float2(half2 val) { ...@@ -163,7 +163,7 @@ __device__ __forceinline__ static float2 half22float2(half2 val) {
return __half22float2(val); return __half22float2(val);
} }
__device__ __forceinline__ static float2 half22float2(__nv_bfloat162 val) { __device__ __forceinline__ static float2 half22float2(__hip_bfloat162 val) {
return __bfloat1622float2(val); return __bfloat1622float2(val);
} }
...@@ -176,7 +176,7 @@ __device__ __forceinline__ half2 float22half2<half2>(float2 val) { ...@@ -176,7 +176,7 @@ __device__ __forceinline__ half2 float22half2<half2>(float2 val) {
} }
template<> template<>
__device__ __forceinline__ __nv_bfloat162 float22half2<__nv_bfloat162>(float2 val) { __device__ __forceinline__ __hip_bfloat162 float22half2<__hip_bfloat162>(float2 val) {
return __float22bfloat162_rn(val); return __float22bfloat162_rn(val);
} }
...@@ -334,13 +334,13 @@ __device__ __forceinline__ static half2 h2div(half2 a, half2 b) { ...@@ -334,13 +334,13 @@ __device__ __forceinline__ static half2 h2div(half2 a, half2 b) {
of.y = __fdividef(af.y, bf.y); of.y = __fdividef(af.y, bf.y);
return float22half2<half2>(of); return float22half2<half2>(of);
}; };
__device__ __forceinline__ static __nv_bfloat162 h2div(__nv_bfloat162 a, __nv_bfloat162 b) { __device__ __forceinline__ static __hip_bfloat162 h2div(__hip_bfloat162 a, __hip_bfloat162 b) {
float2 af = half22float2(a); float2 af = half22float2(a);
float2 bf = half22float2(b); float2 bf = half22float2(b);
float2 of; float2 of;
of.x = __fdividef(af.x, bf.x); of.x = __fdividef(af.x, bf.x);
of.y = __fdividef(af.y, bf.y); of.y = __fdividef(af.y, bf.y);
return float22half2<__nv_bfloat162>(of); return float22half2<__hip_bfloat162>(of);
}; };
__device__ __forceinline__ static void reduce_add(float *addr, float val) { __device__ __forceinline__ static void reduce_add(float *addr, float val) {
......
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "gemm_base.cuh" #include "gemm_base.cuh"
...@@ -25,7 +26,7 @@ public: ...@@ -25,7 +26,7 @@ public:
// micro-scales for FP4 MMA // micro-scales for FP4 MMA
// each uint32_t is a 4*32 matrix of scales (for MMA of 64*32) // each uint32_t is a 4*32 matrix of scales (for MMA of 64*32)
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1200 #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 1200
static constexpr bool FP4_AVAILABLE = true; static constexpr bool FP4_AVAILABLE = true;
#else #else
static constexpr bool FP4_AVAILABLE = false; static constexpr bool FP4_AVAILABLE = false;
...@@ -623,7 +624,7 @@ public: ...@@ -623,7 +624,7 @@ public:
// each thread block (1 warp) quantize WARP_M * WARP_K tile (32 * 64) // each thread block (1 warp) quantize WARP_M * WARP_K tile (32 * 64)
struct quantize_w4a4_act_kernel { struct quantize_w4a4_act_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
__device__ void operator()(const half_t *input, packed_act_t *output, packed_ascale_t *oscales, int K) { __device__ void operator()(const half_t *input, packed_act_t *output, packed_ascale_t *oscales, int K) {
const int laneId = threadIdx.x % WARP_SIZE; const int laneId = threadIdx.x % WARP_SIZE;
...@@ -660,7 +661,7 @@ public: ...@@ -660,7 +661,7 @@ public:
// each thread block (1 warp) quantize WARP_N * WARP_K tile (128 * 64) // each thread block (1 warp) quantize WARP_N * WARP_K tile (128 * 64)
struct quantize_w4a4_wgt_kernel { struct quantize_w4a4_wgt_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
__device__ void operator()(const half_t *input, packed_wgt_t *output, packed_wscale_t *oscales, int K) { __device__ void operator()(const half_t *input, packed_wgt_t *output, packed_wscale_t *oscales, int K) {
const int laneId = threadIdx.x % WARP_SIZE; const int laneId = threadIdx.x % WARP_SIZE;
...@@ -721,9 +722,9 @@ public: ...@@ -721,9 +722,9 @@ public:
template<bool ACT_UNSIGNED, typename T> template<bool ACT_UNSIGNED, typename T>
__device__ __forceinline__ static void __device__ __forceinline__ static void
compute(act_warp A, wgt_warp W, ascale_warp ascale, wscale_warp wscale, T &fpsum) { compute(act_warp A, wgt_warp W, ascale_warp ascale, wscale_warp wscale, T &fpsum) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 800 #if defined(__DTK_ARCH__) && __DTK_ARCH__ == 800
using int2half2 = i2f_sm80; using int2half2 = i2f_sm80;
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 #elif defined(__DTK_ARCH__) && __DTK_ARCH__ == 750
using int2half2 = std::conditional_t<Config::FASTER_I2F, i2f_sm75_fast, i2f_sm75>; using int2half2 = std::conditional_t<Config::FASTER_I2F, i2f_sm75_fast, i2f_sm75>;
; ;
#else #else
...@@ -901,7 +902,7 @@ public: ...@@ -901,7 +902,7 @@ public:
compute<ACT_UNSIGNED>(A[k2], W[k2], ascale[k2], wscale[k2], fpsum); compute<ACT_UNSIGNED>(A[k2], W[k2], ascale[k2], wscale[k2], fpsum);
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
if (alwaysfalse) { if (alwaysfalse) {
dummy = clock(); dummy = clock();
} }
...@@ -1045,7 +1046,7 @@ public: ...@@ -1045,7 +1046,7 @@ public:
template<typename Epilogue, bool ACT_UNSIGNED> template<typename Epilogue, bool ACT_UNSIGNED>
struct gemm_w4a4_kernel { struct gemm_w4a4_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MAX_ARCH = Config::FASTER_I2F ? 750 : INT_MAX; // FASTER_I2F is only needed on sm_75 static constexpr int MAX_ARCH = Config::FASTER_I2F ? 750 : INT_MAX; // FASTER_I2F is only needed on sm_75
__device__ void operator()(const packed_act_t *act, __device__ void operator()(const packed_act_t *act,
...@@ -1098,7 +1099,7 @@ public: ...@@ -1098,7 +1099,7 @@ public:
struct quantize_w4a4_fuse_lora_kernel { struct quantize_w4a4_fuse_lora_kernel {
using oscales_t = typename std::conditional_t<use_fp4, packed_amscale_t, packed_ascale_t>; using oscales_t = typename std::conditional_t<use_fp4, packed_amscale_t, packed_ascale_t>;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750; static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr size_t SHMEM_PER_WARP = static constexpr size_t SHMEM_PER_WARP =
ceilDiv<size_t>(Base::template load_act_to_fpsum<fuse_glu>::SHMEM_SIZE, 128) * 128; ceilDiv<size_t>(Base::template load_act_to_fpsum<fuse_glu>::SHMEM_SIZE, 128) * 128;
static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS; static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
......
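Note on the hunks above: across these GEMM/quantization headers the hipify change is the architecture macro, __CUDA_ARCH__ -> __DTK_ARCH__, so the per-architecture branches (MIN_ARCH/MAX_ARCH checks, i2f selection, FP4 availability) keep compiling under the new toolchain. A hedged sketch of the guard pattern used by invoke_kernel, with a made-up kernel functor; only the MIN_ARCH convention is taken from the diff:

#include <hip/hip_runtime.h>

struct example_kernel {
    static constexpr int MIN_ARCH = 750;  // same convention as the MIN_ARCH fields above
    __device__ void operator()(float *out, float v) const { *out = v; }
};

template<typename Kernel, typename... Args>
__global__ static void guarded_invoke(Args... args) {
#ifdef __DTK_ARCH__
    // Instantiate the body only on device passes targeting a new-enough arch;
    // older targets compile this to an empty kernel instead of failing to build.
    if constexpr (__DTK_ARCH__ >= Kernel::MIN_ARCH) {
        Kernel()(args...);
    }
#endif
}

// Launch example:
//   hipLaunchKernelGGL((guarded_invoke<example_kernel, float *, float>),
//                      dim3(1), dim3(1), 0, 0, out_ptr, 1.0f);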