Commit 0a7c8614 authored by fengzch-das

Revert "hipify code"

This reverts commit 1a8114bf
parent 1a8114bf
Pipeline #3050 failed in 0 seconds
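For context: hipify rewrites CUDA triple-chevron launches into hipLaunchKernelGGL calls and swaps cuda* runtime symbols for their hip* counterparts; reverting the hipify commit restores the CUDA spellings seen throughout the hunks below. A minimal standalone sketch of the two launch forms, assuming a toy add_one kernel that is not part of this repository:

```cuda
// Sketch (not from this repo) of the launch syntax this revert restores.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void add_one(float *x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += 1.0f;
}

int main() {
    const int n = 1024;
    float *x;
    cudaMalloc(&x, n * sizeof(float));
    cudaMemset(x, 0, n * sizeof(float));

    dim3 block(256);
    dim3 grid((n + block.x - 1) / block.x);

    // CUDA form (restored by this revert):
    add_one<<<grid, block, 0, /*stream=*/0>>>(x, n);
    // HIP form produced by hipify (what the parent commit 1a8114bf had):
    //   hipLaunchKernelGGL(add_one, grid, block, 0, 0, x, n);

    cudaDeviceSynchronize();
    printf("last error: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(x);
    return 0;
}
```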
......@@ -3,7 +3,7 @@
#include "dispatch_cutlass.h"
#include <hip/hip_runtime.h>
#include <cuda_runtime.h>
#include "cutlass/cutlass.h"
#include "cutlass/conv/device/direct_convolution.h"
......@@ -74,7 +74,7 @@ static cutlass::Status depthwise_conv2d_kernel_run(cutlass::conv::Conv2dProblemS
UnderlyingKernel::ElementA *A, UnderlyingKernel::ElementB *B,
UnderlyingKernel::ElementC *C, UnderlyingKernel::ElementC *D,
ElementCompute alpha, ElementCompute beta, std::string split_k_mode,
hipStream_t stream, int device_id = 0)
cudaStream_t stream, int device_id = 0)
{
// create the tensor references
cutlass::Tensor4DCoord tensor_coord_A = cutlass::conv::implicit_gemm_tensor_a_extent(
......@@ -183,7 +183,7 @@ Tensor depthwise_conv2d_kernel(Tensor A, Tensor B) {
Tensor D = Tensor::allocate({N, P, Q, K}, A.dtype(), A.device());
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
cutlass::Status status = depthwise_conv2d_kernel_run(
&problem_size,
......@@ -319,7 +319,7 @@ Tensor dwconv_f16(Tensor input, Tensor weight, Tensor out, Tensor bias) {
size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
BufferCUDA workspace(workspace_size);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
if (status != cutlass::Status::kSuccess) {
......
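A standalone sketch of the stream plumbing the hunks above restore: the conv runner takes a cudaStream_t parameter and every launch targets that stream. getCurrentCUDAStream() in the diff is the project's own helper; the sketch below creates a stream explicitly instead, and scale_kernel/scale_run are hypothetical stand-ins:

```cuda
#include <cuda_runtime.h>

__global__ void scale_kernel(float *x, float alpha, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= alpha;
}

// Mirrors the shape of depthwise_conv2d_kernel_run: the stream is an argument.
static cudaError_t scale_run(float *x, float alpha, int n, cudaStream_t stream) {
    int block = 256;
    int grid = (n + block - 1) / block;
    scale_kernel<<<grid, block, 0, stream>>>(x, alpha, n);
    return cudaGetLastError();
}

int main() {
    const int n = 1 << 20;
    float *x;
    cudaMalloc(&x, n * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    scale_run(x, 2.0f, n, stream);     // asynchronous on the given stream
    cudaStreamSynchronize(stream);     // wait for completion
    cudaStreamDestroy(stream);
    cudaFree(x);
    return 0;
}
```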
#include "hip/hip_runtime.h"
#include "layernorm_kernels_impl.cuh"
#include "dispatch_utils.h"
......@@ -11,17 +10,17 @@ void rms_norm(Tensor &out, // [..., hidden_size]
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
const cudaStream_t stream = getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
if (use_quant) {
hipLaunchKernelGGL(( vllm::rms_norm_kernel<scalar_t, int8_t, true>), dim3(grid), dim3(block), 0, stream, out.data_ptr<int8_t>(),
vllm::rms_norm_kernel<scalar_t, int8_t, true><<<grid, block, 0, stream>>>(out.data_ptr<int8_t>(),
input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
epsilon,
num_tokens,
hidden_size);
} else {
hipLaunchKernelGGL(( vllm::rms_norm_kernel<scalar_t, scalar_t, false>), dim3(grid), dim3(block), 0, stream, out.data_ptr<scalar_t>(),
vllm::rms_norm_kernel<scalar_t, scalar_t, false><<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),
input.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
epsilon,
......@@ -40,10 +39,10 @@ void layernorm_general(Tensor out, Tensor input, Tensor weight, Tensor bias, flo
size_t size_shmem = input.scalar_size() * hidden_size;
const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
const cudaStream_t stream = getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
using T = typename packed_as<scalar_t, 2>::type;
hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half, true>), dim3(grid), dim3(block), size_shmem, stream,
vllm::generalLayerNorm<T, half, true><<<grid, block, size_shmem, stream>>>(
reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
weight.valid() ? reinterpret_cast<T *>(weight.data_ptr<scalar_t>()) : nullptr,
bias.valid() ? reinterpret_cast<T *>(bias.data_ptr<scalar_t>()) : nullptr,
......@@ -70,13 +69,13 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
dim3 block(std::min(hidden_size, 1024));
block.x = 32 * ((block.x + 31) / 32);
const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
const cudaStream_t stream = getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm", [&] {
using T = scalar_t;
if (use_per_token_quant) {
// per-token
hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half>)
, dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
vllm::generalLayerNorm<T, half>
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr,
nullptr,
......@@ -93,8 +92,8 @@ void rms_norm_general(Tensor &out, // [..., hidden_size]
// weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
} else {
// per-tensor
hipLaunchKernelGGL(( vllm::generalLayerNorm<T, half>)
, dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
vllm::generalLayerNorm<T, half>
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr,
nullptr,
......@@ -122,13 +121,13 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
dim3 block(std::min(hidden_size, 1024));
block.x = 32 * ((block.x + 31) / 32);
const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
const cudaStream_t stream = getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "generalLayerNorm_fuse_sum", [&] {
using T = scalar_t;
if (use_per_token_quant) {
// per-token
hipLaunchKernelGGL(( vllm::generalLayerNorm_fuse_sum<T, half>)
, dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
vllm::generalLayerNorm_fuse_sum<T, half>
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr,
nullptr,
......@@ -150,8 +149,8 @@ void rms_norm_general_fuse_sum(Tensor &out, // [..., hidden_size]
// Not implemented per-tensor input_sum
assert(false);
hipLaunchKernelGGL(( vllm::generalLayerNorm_fuse_sum<T, half>)
, dim3(grid), dim3(block), 0, stream, reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
vllm::generalLayerNorm_fuse_sum<T, half>
<<<grid, block, 0, stream>>>(reinterpret_cast<T *>(input.data_ptr<scalar_t>()),
reinterpret_cast<T *>(weight.data_ptr<scalar_t>()),
nullptr,
nullptr,
......@@ -177,10 +176,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
const cudaStream_t stream = getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
hipLaunchKernelGGL(( vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false>)
, dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half, false>
<<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(),
residual.data_ptr<scalar_t>(),
out.data_ptr<int8_t>(),
gamma.data_ptr<scalar_t>(),
......@@ -203,10 +202,10 @@ void invoke_dequant_add_residual_rms_norm_quant(Tensor &out, // [..., hidde
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const hipStream_t stream = getCurrentHIPStreamMasqueradingAsCUDA();
const cudaStream_t stream = getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(residual.scalar_type(), "dequant_add_residual_rms_norm_quant_kernel", [&] {
hipLaunchKernelGGL(( vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true>)
, dim3(grid), dim3(block), 0, stream, input.data_ptr<int32_t>(),
vllm::dequant_add_residual_rms_norm_quant_kernel<scalar_t, half *, true>
<<<grid, block, 0, stream>>>(input.data_ptr<int32_t>(),
residual.data_ptr<scalar_t>(),
out.data_ptr<int8_t>(),
gamma.data_ptr<scalar_t>(),
......
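The layernorm launches above follow one pattern: a templated kernel launched as kernel<T, ...><<<grid, block, size_shmem, stream>>> with the dynamic shared-memory size computed from the hidden size. A simplified, self-contained illustration of that launch shape (row_mean_kernel is a toy reduction, not the real generalLayerNorm):

```cuda
#include <cuda_runtime.h>

template <typename T>
__global__ void row_mean_kernel(const T *in, float *out, int hidden_size) {
    extern __shared__ unsigned char smem_raw[];
    float *smem = reinterpret_cast<float *>(smem_raw);

    const T *row = in + blockIdx.x * hidden_size;
    float partial = 0.f;
    for (int i = threadIdx.x; i < hidden_size; i += blockDim.x)
        partial += static_cast<float>(row[i]);
    smem[threadIdx.x] = partial;
    __syncthreads();

    // tree reduction over the block (blockDim.x is a power of two here)
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) out[blockIdx.x] = smem[0] / hidden_size;
}

void launch_row_mean(const float *in, float *out, int num_tokens, int hidden_size,
                     cudaStream_t stream) {
    dim3 grid(num_tokens);
    dim3 block(256);
    size_t size_shmem = block.x * sizeof(float);  // dynamic shared memory, as in the diff
    row_mean_kernel<float><<<grid, block, size_shmem, stream>>>(in, out, hidden_size);
}
```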
......@@ -2,7 +2,7 @@
#include "common.h"
#include "Tensor.h"
#include <hip/hip_fp16.h>
#include <cuda_fp16.h>
void rms_norm(Tensor &out, // [num_tokens, hidden_size]
Tensor &input, // [num_tokens, hidden_size]
......
#include "hip/hip_runtime.h"
#include <hip/hip_bf16.h>
#include <cuda_bf16.h>
#define ENABLE_BF16 1
......
#include "hip/hip_runtime.h"
#include "misc_kernels_impl.cuh"
#include "misc_kernels.h"
#include "dispatch_utils.h"
......@@ -14,12 +13,12 @@ Tensor add(Tensor a, Tensor b) {
int threadsPerBlock = 1024;
int blocksPerGrid = (a.numel() + threadsPerBlock - 1) / threadsPerBlock;
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
Tensor out = Tensor::empty_like(a);
dispatch(out.scalar_type(), [&]<typename scalar_t>() {
hipLaunchKernelGGL(( add_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
add_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
a.data_ptr<scalar_t>(), b.data_ptr<scalar_t>(), out.data_ptr<scalar_t>(), out.numel());
});
......@@ -47,12 +46,12 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
dispatch(x.scalar_type(), [&]<typename scalar_t>() {
if (scale.valid()) {
hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, false>)
, dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
mul_add_kernel<scalar_t, unroll, false>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
scale.data_ptr<scalar_t>(),
bias.data_ptr<scalar_t>(),
0,
......@@ -63,7 +62,7 @@ void mul_add(Tensor x, Tensor scale, Tensor bias) {
0,
0);
} else {
hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, true>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
mul_add_kernel<scalar_t, unroll, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
x.data_ptr<scalar_t>(), nullptr, bias.data_ptr<scalar_t>(), 0, x.numel(), 1, bias.numel(), 0, 0, 0);
}
});
......@@ -97,12 +96,12 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
int threadsPerBlock = 1024;
dim3 grid(ceilDiv(numel, threadsPerBlock * unroll), batch_size);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
dispatch(x.scalar_type(), [&]<typename scalar_t>() {
if (scale.valid()) {
hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, false>)
, dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
mul_add_kernel<scalar_t, unroll, false>
<<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
scale.data_ptr<scalar_t>(),
bias.data_ptr<scalar_t>(),
(scalar_t)scale_shift,
......@@ -113,8 +112,8 @@ void mul_add_batch(Tensor x, Tensor scale, bool batch_scale, double scale_shift,
batch_scale ? scale.stride(0) : 0,
batch_bias ? bias.stride(0) : 0);
} else {
hipLaunchKernelGGL(( mul_add_kernel<scalar_t, unroll, true>)
, dim3(grid), dim3(threadsPerBlock), 0, stream, x.data_ptr<scalar_t>(),
mul_add_kernel<scalar_t, unroll, true>
<<<grid, threadsPerBlock, 0, stream>>>(x.data_ptr<scalar_t>(),
nullptr,
bias.data_ptr<scalar_t>(),
(scalar_t)scale_shift,
......@@ -135,12 +134,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
auto shapeOut = input_id.shape;
shapeOut.dataExtent.push_back(lookup.shape[-1]);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
Tensor out = Tensor::empty(shapeOut, lookup.scalar_type(), input_id.device());
dispatch(out.scalar_type(), [&]<typename scalar_t>() {
hipLaunchKernelGGL(( EmbeddingKernel), dim3(input_id.numel()), dim3(std::min(lookup.shape[-1], 1024)), 0, stream,
EmbeddingKernel<<<input_id.numel(), std::min(lookup.shape[-1], 1024), 0, stream>>>(
input_id.data_ptr<int32_t>(), out.data_ptr<scalar_t>(), lookup.data_ptr<scalar_t>(), lookup.shape[-1]);
});
......@@ -150,12 +149,12 @@ Tensor embedding(Tensor input_id, Tensor lookup) {
Tensor argmax_sample(Tensor logits) {
assert(logits.ndims() == 2);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
Tensor out = Tensor::empty({logits.shape[0]}, Tensor::INT32, logits.device());
dispatch(logits.scalar_type(), [&]<typename scalar_t>() {
hipLaunchKernelGGL(( argmax_sample_kernel), dim3(logits.shape[0]), dim3(std::min(logits.shape[1], 1024)), 0, stream,
argmax_sample_kernel<<<logits.shape[0], std::min(logits.shape[1], 1024), 0, stream>>>(
logits.data_ptr<scalar_t>(), out.data_ptr<int32_t>(), logits.shape[1]);
});
......@@ -168,7 +167,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
// assert(qkv.shape[0] == k.shape[0]);
// assert(qkv.shape[0] == v.shape[0]);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
int dim_q = q.shape[-1] * q.shape[-2];
int dim_k = k.shape[-1] * k.shape[-2];
......@@ -180,7 +179,7 @@ void splitqkv(Tensor qkv, Tensor q, Tensor k, Tensor v) {
int num_tokens = qkv.numel() / qkv.shape[-1];
dispatch(qkv.scalar_type(), [&]<typename scalar_t>() {
hipLaunchKernelGGL(( splitqkv_kernel), dim3(num_tokens), dim3(std::min(qkv.shape[-1], 1024)), 0, stream, qkv.data_ptr<scalar_t>(),
splitqkv_kernel<<<num_tokens, std::min(qkv.shape[-1], 1024), 0, stream>>>(qkv.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(),
k.data_ptr<scalar_t>(),
v.data_ptr<scalar_t>(),
......@@ -196,7 +195,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
int threadsPerBlock = 1024;
int blocksPerGrid = (input.numel() + threadsPerBlock - 1) / threadsPerBlock;
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
auto shapeOut = TensorShape(input.shape.dataExtent);
shapeOut[-1] /= N;
......@@ -211,7 +210,7 @@ std::array<Tensor, N> split_mod(Tensor input) {
for (int k = 0; k < N; k++) {
outPtr[k] = out[k].template data_ptr<scalar_t>();
}
hipLaunchKernelGGL(( split_mod_kernel), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
split_mod_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
input.data_ptr<scalar_t>(), outPtr, input.numel());
});
......@@ -228,10 +227,10 @@ Tensor quant_static(Tensor x, float scale) {
int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
dispatch(x.scalar_type(), [&]<typename scalar_t>() {
hipLaunchKernelGGL(( quant_kernel_static<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
quant_kernel_static<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
});
......@@ -248,10 +247,10 @@ Tensor quant_static_fuse_gelu(Tensor x, float scale) {
int threadsPerBlock = 1024;
int blocksPerGrid = (x.numel() + threadsPerBlock * unroll - 1) / (threadsPerBlock * unroll);
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
dispatch(x.scalar_type(), [&]<typename scalar_t>() {
hipLaunchKernelGGL(( quant_kernel_static_fuse_gelu<scalar_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
quant_kernel_static_fuse_gelu<scalar_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
x.data_ptr<scalar_t>(), out.data_ptr<int8_t>(), (scalar_t)scale, x.numel());
});
......@@ -267,7 +266,7 @@ void cast(Tensor input, Tensor output) {
assert(input.scalar_size() == output.scalar_size());
}
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
dispatch(input.scalar_type(), [&]<typename input_t>() {
dispatch(output.scalar_type(), [&]<typename output_t>() {
......@@ -276,10 +275,10 @@ void cast(Tensor input, Tensor output) {
int threadsPerBlock = 1024;
int blocksPerGrid = (int)ceilDiv<int64_t>(input.numel(), threadsPerBlock * unroll);
hipLaunchKernelGGL(( cast_kernel<input_t, output_t, unroll>), dim3(blocksPerGrid), dim3(threadsPerBlock), 0, stream,
cast_kernel<input_t, output_t, unroll><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
input.data_ptr<input_t>(), output.data_ptr<output_t>(), input.numel());
checkCUDA(hipGetLastError());
checkCUDA(cudaGetLastError());
});
});
}
......@@ -299,7 +298,7 @@ Tensor topk(Tensor x, int k) {
Tensor out = Tensor::empty(outShape, Tensor::INT32, x.device());
auto stream = getCurrentHIPStreamMasqueradingAsCUDA();
auto stream = getCurrentCUDAStream();
dispatchVal(k, std::make_integer_sequence<int, MAXK + 1>(), [&]<int K>() {
if constexpr (K == 0) {
......@@ -308,9 +307,9 @@ Tensor topk(Tensor x, int k) {
}
if constexpr (K > 0) {
dispatch(x.scalar_type(), [&]<typename scalar_t>() {
hipLaunchKernelGGL(( topk_kernel<scalar_t, K>), dim3(ceilDiv(batch, 32)), dim3(32), 0, stream,
topk_kernel<scalar_t, K><<<ceilDiv(batch, 32), 32, 0, stream>>>(
x.data_ptr<scalar_t>(), out.data_ptr<int>(), N, x.stride(-2), batch);
checkCUDA(hipGetLastError());
checkCUDA(cudaGetLastError());
});
}
});
......
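The misc kernels above guard launches with checkCUDA(cudaGetLastError()). checkCUDA is the project's own macro; CHECK_CUDA below is a hypothetical stand-in showing the usual shape of such a wrapper and where the two checks catch launch-configuration versus execution failures:

```cuda
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_CUDA(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

__global__ void fill_kernel(int *x, int v, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] = v;
}

int main() {
    const int n = 4096;
    int *x;
    CHECK_CUDA(cudaMalloc(&x, n * sizeof(int)));
    fill_kernel<<<(n + 255) / 256, 256>>>(x, 7, n);
    CHECK_CUDA(cudaGetLastError());        // catches launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());   // catches errors raised while the kernel runs
    CHECK_CUDA(cudaFree(x));
    return 0;
}
```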
#include "hip/hip_runtime.h"
#include "reduction_utils.cuh"
#include <array>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include "utils.cuh"
#include "activation_kernels_impl.cuh"
......
#include "hip/hip_runtime.h"
/*
* Adapted from
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
......
#include "hip/hip_runtime.h"
// Adapted from FasterTransformer,
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
#pragma once
......@@ -10,10 +9,10 @@
#include <cstdio>
#include <hip/hip_fp16.h>
#include <cuda_fp16.h>
#ifdef ENABLE_BF16
#include <hip/hip_bf16.h>
#include <cuda_bf16.h>
#endif
__device__ __forceinline__ static void trap_unsupported_arch() {
......@@ -25,11 +24,11 @@ __device__ __forceinline__ static void trap_unsupported_arch() {
__trap();
}
#if defined(ENABLE_BF16) && defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
__device__ __forceinline__ static __hip_bfloat162
__hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, const __hip_bfloat162 c) {
#if defined(ENABLE_BF16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
__device__ __forceinline__ static __nv_bfloat162
__hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) {
trap_unsupported_arch();
return __hip_bfloat162(0.0f, 0.0f);
return __nv_bfloat162(0.0f, 0.0f);
}
#endif
......@@ -57,11 +56,11 @@ struct num_elems<half2> {
};
#ifdef ENABLE_BF16
template<>
struct num_elems<__hip_bfloat16> {
struct num_elems<__nv_bfloat16> {
static constexpr int value = 1;
};
template<>
struct num_elems<__hip_bfloat162> {
struct num_elems<__nv_bfloat162> {
static constexpr int value = 2;
};
#endif
......@@ -108,12 +107,12 @@ struct packed_as<float2, 1> {
};
#ifdef ENABLE_BF16
template<>
struct packed_as<__hip_bfloat16, 2> {
using type = __hip_bfloat162;
struct packed_as<__nv_bfloat16, 2> {
using type = __nv_bfloat162;
};
template<>
struct packed_as<__hip_bfloat162, 1> {
using type = __hip_bfloat16;
struct packed_as<__nv_bfloat162, 1> {
using type = __nv_bfloat16;
};
#endif
#ifdef ENABLE_FP8
......@@ -170,8 +169,8 @@ inline __device__ T ldg(const T *val) {
#define bf1622float2 __bfloat1622float2
#define float22bf162 __float22bfloat162_rn
#define bf162bf162 __bfloat162bfloat162
inline __device__ int16_t bf1622int16(__hip_bfloat162 val) {
#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float2 f_val;
f_val.x = max(min(__low2float(val), 127.f), -128.f);
f_val.y = max(min(__high2float(val), 127.f), -128.f);
......@@ -202,8 +201,8 @@ inline __device__ int16_t bf1622int16(__hip_bfloat162 val) {
#if ENABLE_BF16
template<>
inline __device__ __hip_bfloat162 ldg(const __hip_bfloat162 *val) {
#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162 *val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return val[0];
#else
return __ldg(val);
......@@ -211,8 +210,8 @@ inline __device__ __hip_bfloat162 ldg(const __hip_bfloat162 *val) {
}
template<>
inline __device__ __hip_bfloat16 ldg(const __hip_bfloat16 *val) {
#if defined(__DTK_ARCH__) && __DTK_ARCH__ < 800
inline __device__ __nv_bfloat16 ldg(const __nv_bfloat16 *val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return val[0];
#else
return __ldg(val);
......@@ -331,81 +330,81 @@ __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) {
#ifdef ENABLE_BF16
template<>
__device__ inline __hip_bfloat16 cuda_cast(int32_t val) {
__device__ inline __nv_bfloat16 cuda_cast(int32_t val) {
return static_cast<float>(val);
}
template<>
__device__ inline __hip_bfloat16 cuda_cast(int8_t val) {
__device__ inline __nv_bfloat16 cuda_cast(int8_t val) {
return static_cast<float>(val);
}
template<>
__device__ inline int8_t cuda_cast(__hip_bfloat16 val) {
__device__ inline int8_t cuda_cast(__nv_bfloat16 val) {
return static_cast<float>(val);
}
template<>
__device__ inline float cuda_cast<float, __hip_bfloat16>(__hip_bfloat16 val) {
__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
return __bfloat162float(val);
}
template<>
__device__ inline float2 cuda_cast<float2, __hip_bfloat162>(__hip_bfloat162 val) {
__device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) {
return bf1622float2(val);
}
template<>
__device__ inline half cuda_cast<half, __hip_bfloat16>(__hip_bfloat16 val) {
__device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val) {
return __float2half(__bfloat162float(val));
}
template<>
__device__ inline int16_t cuda_cast<int16_t, __hip_bfloat162>(__hip_bfloat162 val) {
__device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) {
return bf1622int16(val);
}
template<>
__device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, float>(float val) {
__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) {
return __float2bfloat16(val);
}
template<>
__device__ inline __hip_bfloat16 cuda_cast<__hip_bfloat16, half>(half val) {
__device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val) {
return __float2bfloat16(__half2float(val));
}
template<>
__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, __hip_bfloat16>(__hip_bfloat16 val) {
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val) {
return bf162bf162(val);
}
template<>
__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float>(float val) {
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val) {
return __float2bfloat162_rn(val);
}
template<>
__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, float2>(float2 val) {
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) {
return float22bf162(val);
}
template<>
__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, int16_t>(int16_t val) {
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val) {
union {
int8_t int8[2];
int16_t int16;
};
int16 = val;
__hip_bfloat162 res;
res.x = cuda_cast<__hip_bfloat16>(int8[0]);
res.y = cuda_cast<__hip_bfloat16>(int8[1]);
__nv_bfloat162 res;
res.x = cuda_cast<__nv_bfloat16>(int8[0]);
res.y = cuda_cast<__nv_bfloat16>(int8[1]);
return res;
}
template<>
__device__ inline __hip_bfloat162 cuda_cast<__hip_bfloat162, half2>(half2 val) {
__device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) {
return float22bf162(__half22float2(val));
}
......@@ -421,7 +420,7 @@ __device__ __forceinline__ packed_as<half, 2>::type f162f162<half>(half x) {
#ifdef ENABLE_BF16
template<>
__device__ __forceinline__ packed_as<__hip_bfloat16, 2>::type f162f162<__hip_bfloat16>(__hip_bfloat16 x) {
__device__ __forceinline__ packed_as<__nv_bfloat16, 2>::type f162f162<__nv_bfloat16>(__nv_bfloat16 x) {
return __bfloat162bfloat162(x);
}
#endif
......@@ -454,8 +453,8 @@ __device__ inline half cuda_max(half2 val) {
#ifdef ENABLE_BF16
template<>
__device__ inline __hip_bfloat16 cuda_max(__hip_bfloat162 val) {
#if (defined(__DTK_ARCH__) && (__DTK_ARCH__ >= 800))
__device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
return __hmax(val.x, val.y);
#else
assert(false);
......@@ -498,14 +497,14 @@ __device__ inline half2 cuda_abs(half2 val) {
#ifdef ENABLE_BF16
#if __DTK_ARCH__ >= 800 || !defined(__DTK_ARCH__)
#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
template<>
__device__ inline __hip_bfloat16 cuda_abs(__hip_bfloat16 val) {
__device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) {
return __habs(val);
}
template<>
__device__ inline __hip_bfloat162 cuda_abs(__hip_bfloat162 val) {
__device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) {
return __habs2(val);
}
#endif
......
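The utility header above keys its __nv_bfloat16 intrinsics off __CUDA_ARCH__ >= 800 and falls back to float math elsewhere. A minimal sketch of that idiom (bf16_abs and abs_kernel are illustrative, not the project's cuda_abs):

```cuda
#include <cuda_bf16.h>
#include <cuda_runtime.h>

__device__ __forceinline__ __nv_bfloat16 bf16_abs(__nv_bfloat16 v) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
    return __habs(v);                       // native bf16 absolute value on sm_80+
#else
    float f = __bfloat162float(v);          // fall back through float on older archs
    return __float2bfloat16(f < 0.f ? -f : f);
#endif
}

__global__ void abs_kernel(const __nv_bfloat16 *in, __nv_bfloat16 *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = bf16_abs(in[i]);
}
```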
#include "hip/hip_runtime.h"
#include "zgemm.h"
#include "attention.cuh"
......@@ -72,10 +71,10 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
shmem = std::max(shmem, Attention::template attention_fp16_kernel<Epilogue>::SHMEM_SIZE);
if (shmem >= 24 * 1024) {
checkCUDA(hipFuncSetAttribute(func, hipFuncAttributeMaxDynamicSharedMemorySize, shmem));
checkCUDA(cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
}
hipLaunchKernelGGL(( func), dim3(grid), dim3(GEMM::WARP_SIZE * GEMM::NUM_WARPS), shmem, getCurrentHIPStreamMasqueradingAsCUDA(), q.data_ptr<packed_q_t>(),
func<<<grid, GEMM::WARP_SIZE * GEMM::NUM_WARPS, shmem, getCurrentCUDAStream()>>>(q.data_ptr<packed_q_t>(),
k.data_ptr<packed_k_t>(),
v.data_ptr<packed_v_t>(),
scale,
......@@ -83,7 +82,7 @@ void attention_fp16(Tensor q, // packed [Batch, Head, TokensQ, HEAD_DIM]
numTokensKV,
args,
false);
checkCUDA(hipGetLastError());
checkCUDA(cudaGetLastError());
};
launch.template operator()<typename GEMM::EpilogueDefault>(typename GEMM::EpilogueDefault::Arguments{
......
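The attention launcher above opts in to large dynamic shared memory with cudaFuncSetAttribute before the launch. A minimal standalone version of the same opt-in; the 64 KB figure and big_smem_kernel are illustrative only:

```cuda
#include <cuda_runtime.h>

__global__ void big_smem_kernel(float *out) {
    extern __shared__ float smem[];
    smem[threadIdx.x] = threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) out[blockIdx.x] = smem[0];
}

int main() {
    float *out;
    cudaMalloc(&out, sizeof(float));

    size_t shmem = 64 * 1024;  // above the 48 KB default, so opt in explicitly
    if (shmem > 48 * 1024) {
        cudaFuncSetAttribute(big_smem_kernel,
                             cudaFuncAttributeMaxDynamicSharedMemorySize,
                             static_cast<int>(shmem));
    }
    big_smem_kernel<<<1, 256, shmem, 0>>>(out);
    cudaDeviceSynchronize();
    cudaFree(out);
    return 0;
}
```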
#include "hip/hip_runtime.h"
#pragma once
#include "gemm_base.cuh"
......@@ -27,8 +26,8 @@ struct AttentionFP16Config {
using half_t = half;
using half2_t = half2;
using epilogue_half_t = typename std::conditional_t<bf16out, __hip_bfloat16, half>;
using epilogue_half2_t = typename std::conditional_t<bf16out, __hip_bfloat162, half2>;
using epilogue_half_t = typename std::conditional_t<bf16out, __nv_bfloat16, half>;
using epilogue_half2_t = typename std::conditional_t<bf16out, __nv_bfloat162, half2>;
};
using AttentionFP16Config_FP16 = AttentionFP16Config<false>;
......@@ -61,7 +60,7 @@ public:
using typename AttentionConfig::epilogue_half_t;
using typename AttentionConfig::epilogue_half2_t;
#if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
static constexpr bool IS_SM80 = true;
#else
static constexpr bool IS_SM80 = false;
......@@ -658,7 +657,7 @@ public:
template<typename Epilogue>
struct attention_fp16_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
static constexpr int SHMEM_SIZE = 0; // sizeof(q_shmem_t);
__device__ void operator()(const packed_q_t *ptr_q,
......
#include "hip/hip_runtime.h"
#pragma once
#include "gemm_base.cuh"
......@@ -703,7 +702,7 @@ public:
// q: [batch_size, #blocks, block_size, #heads, HEAD_DIM]
// vk: [batch_size, #heads, HEAD_DIM+1, HEAD_DIM]
struct vk_mul_q_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
// FIXME FIXME FIXME
__device__ void operator()(half_t *q, const float *vk, float eps, int num_tokens) {
const int block_id = blockIdx.x;
......@@ -763,7 +762,7 @@ public:
template<typename Epilogue>
struct test_epilogue_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
static constexpr size_t SHMEM_PER_WARP =
ceilDiv<size_t>(Base::template load_act_to_fpsum<false>::SHMEM_SIZE, 128) * 128;
static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
......
#include "hip/hip_runtime.h"
#pragma once
#include "common.h"
......@@ -45,8 +44,8 @@ public:
// may generate incorrect results in certain circumstances
static constexpr bool FASTER_I2F = faster_i2f;
using half_t = typename std::conditional_t<bf16, __hip_bfloat16, half>;
using half2_t = typename std::conditional_t<bf16, __hip_bfloat162, half2>;
using half_t = typename std::conditional_t<bf16, __nv_bfloat16, half>;
using half2_t = typename std::conditional_t<bf16, __nv_bfloat162, half2>;
};
using GEMMConfig_W4A4_FP16 = GEMMConfig_W4A4<false>;
......@@ -68,8 +67,8 @@ public:
using half_t = half;
using half2_t = half2;
#else
using half_t = __hip_bfloat16;
using half2_t = __hip_bfloat162;
using half_t = __nv_bfloat16;
using half2_t = __nv_bfloat162;
#endif
};
......@@ -203,9 +202,9 @@ public:
__device__ __forceinline__ static packed_f32psum_t
mma_f16xf16_f32(packed_fpsum_t a, packed_fpsum_t b, packed_f32psum_t psum) {
static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __hip_bfloat16>);
static_assert(std::is_same_v<half_t, half> || std::is_same_v<half_t, __nv_bfloat16>);
static constexpr bool is_bf16 = std::is_same_v<half_t, __hip_bfloat16>;
static constexpr bool is_bf16 = std::is_same_v<half_t, __nv_bfloat16>;
uint4 out1 = mma_m16n8k16_f32f16f16f32<is_bf16>(
kernels::bit_cast<uint4>(a),
......@@ -891,8 +890,8 @@ constexpr int max_arch() {
template<typename kernel, typename... T>
__global__ static void invoke_kernel(T... args) {
#ifdef __DTK_ARCH__
if constexpr (__DTK_ARCH__ >= min_arch<kernel>() && __DTK_ARCH__ <= max_arch<kernel>()) {
#ifdef __CUDA_ARCH__
if constexpr (__CUDA_ARCH__ >= min_arch<kernel>() && __CUDA_ARCH__ <= max_arch<kernel>()) {
kernel()(args...);
} else {
trap_unsupported_arch();
......@@ -917,8 +916,8 @@ template<typename T>
static void test_sizeof() {
printf("typeid = %s\n", typeid(T).name());
test_sizeof_host<T>();
hipLaunchKernelGGL(( test_sizeof_device<T>), dim3(1), dim3(1), 0, 0, );
checkCUDA(hipDeviceSynchronize());
test_sizeof_device<T><<<1, 1>>>();
checkCUDA(cudaDeviceSynchronize());
}
}; // namespace nunchaku::kernels
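invoke_kernel above gates the kernel body on __CUDA_ARCH__ at compile time and traps on unsupported architectures. A self-contained restatement of that pattern, reading Kernel::MIN_ARCH directly where the real code goes through min_arch<kernel>()/max_arch<kernel>(); Saxpy is a hypothetical example functor:

```cuda
#include <cuda_runtime.h>

__device__ __forceinline__ static void trap_unsupported_arch() { __trap(); }

template <typename Kernel, typename... Args>
__global__ static void invoke_kernel(Args... args) {
#ifdef __CUDA_ARCH__
    if constexpr (__CUDA_ARCH__ >= Kernel::MIN_ARCH) {
        Kernel()(args...);          // instantiate the body only on supported archs
    } else {
        trap_unsupported_arch();    // otherwise trap at runtime
    }
#endif
}

// Example functor with a minimum-architecture requirement, launched as
//   invoke_kernel<Saxpy><<<grid, block>>>(a, x, y, n);
struct Saxpy {
    static constexpr int MIN_ARCH = 750;
    __device__ void operator()(float a, const float *x, float *y, int n) const {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) y[i] = a * x[i] + y[i];
    }
};
```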
......@@ -163,7 +163,7 @@ __device__ __forceinline__ static float2 half22float2(half2 val) {
return __half22float2(val);
}
__device__ __forceinline__ static float2 half22float2(__hip_bfloat162 val) {
__device__ __forceinline__ static float2 half22float2(__nv_bfloat162 val) {
return __bfloat1622float2(val);
}
......@@ -176,7 +176,7 @@ __device__ __forceinline__ half2 float22half2<half2>(float2 val) {
}
template<>
__device__ __forceinline__ __hip_bfloat162 float22half2<__hip_bfloat162>(float2 val) {
__device__ __forceinline__ __nv_bfloat162 float22half2<__nv_bfloat162>(float2 val) {
return __float22bfloat162_rn(val);
}
......@@ -334,13 +334,13 @@ __device__ __forceinline__ static half2 h2div(half2 a, half2 b) {
of.y = __fdividef(af.y, bf.y);
return float22half2<half2>(of);
};
__device__ __forceinline__ static __hip_bfloat162 h2div(__hip_bfloat162 a, __hip_bfloat162 b) {
__device__ __forceinline__ static __nv_bfloat162 h2div(__nv_bfloat162 a, __nv_bfloat162 b) {
float2 af = half22float2(a);
float2 bf = half22float2(b);
float2 of;
of.x = __fdividef(af.x, bf.x);
of.y = __fdividef(af.y, bf.y);
return float22half2<__hip_bfloat162>(of);
return float22half2<__nv_bfloat162>(of);
};
__device__ __forceinline__ static void reduce_add(float *addr, float val) {
......
#include "hip/hip_runtime.h"
#pragma once
#include "gemm_base.cuh"
......@@ -26,7 +25,7 @@ public:
// micro-scales for FP4 MMA
// each uint32_t is a 4*32 matrix of scales (for MMA of 64*32)
#if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 1200
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1200
static constexpr bool FP4_AVAILABLE = true;
#else
static constexpr bool FP4_AVAILABLE = false;
......@@ -624,7 +623,7 @@ public:
// each thread block (1 warp) quantize WARP_M * WARP_K tile (32 * 64)
struct quantize_w4a4_act_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
__device__ void operator()(const half_t *input, packed_act_t *output, packed_ascale_t *oscales, int K) {
const int laneId = threadIdx.x % WARP_SIZE;
......@@ -661,7 +660,7 @@ public:
// each thread block (1 warp) quantize WARP_N * WARP_K tile (128 * 64)
struct quantize_w4a4_wgt_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
__device__ void operator()(const half_t *input, packed_wgt_t *output, packed_wscale_t *oscales, int K) {
const int laneId = threadIdx.x % WARP_SIZE;
......@@ -722,9 +721,9 @@ public:
template<bool ACT_UNSIGNED, typename T>
__device__ __forceinline__ static void
compute(act_warp A, wgt_warp W, ascale_warp ascale, wscale_warp wscale, T &fpsum) {
#if defined(__DTK_ARCH__) && __DTK_ARCH__ == 800
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 800
using int2half2 = i2f_sm80;
#elif defined(__DTK_ARCH__) && __DTK_ARCH__ == 750
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750
using int2half2 = std::conditional_t<Config::FASTER_I2F, i2f_sm75_fast, i2f_sm75>;
;
#else
......@@ -902,7 +901,7 @@ public:
compute<ACT_UNSIGNED>(A[k2], W[k2], ascale[k2], wscale[k2], fpsum);
// #if defined(__DTK_ARCH__) && __DTK_ARCH__ >= 800
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
if (alwaysfalse) {
dummy = clock();
}
......@@ -1046,7 +1045,7 @@ public:
template<typename Epilogue, bool ACT_UNSIGNED>
struct gemm_w4a4_kernel {
static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
static constexpr int MAX_ARCH = Config::FASTER_I2F ? 750 : INT_MAX; // FASTER_I2F is only needed on sm_75
__device__ void operator()(const packed_act_t *act,
......@@ -1099,7 +1098,7 @@ public:
struct quantize_w4a4_fuse_lora_kernel {
using oscales_t = typename std::conditional_t<use_fp4, packed_amscale_t, packed_ascale_t>;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __hip_bfloat16> ? 800 : 750;
static constexpr int MIN_ARCH = std::is_same_v<half_t, __nv_bfloat16> ? 800 : 750;
static constexpr size_t SHMEM_PER_WARP =
ceilDiv<size_t>(Base::template load_act_to_fpsum<fuse_glu>::SHMEM_SIZE, 128) * 128;
static constexpr size_t SHMEM_SIZE = SHMEM_PER_WARP * NUM_WARPS;
......