rocm_ops.hip

#include "hip/hip_runtime.h"
#include <stdio.h>
#include <math.h>
// Element-wise addition kernel: Z[i] = X[i] + Y[i] for every i in [0, sz).
//
// Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
// Each thread starts at its global index and then advances by the total number
// of launched threads, so all sz elements are processed regardless of how many
// blocks the caller launched (including a grid smaller than the data).
// The index is widened to 64 bits before the multiply so that it cannot wrap
// for very large sz.
__global__ void _Add(long long sz, float* Z, const float* X, const float* Y) {
    const long long stride = static_cast<long long>(hipGridDim_x) * hipBlockDim_x;
    long long offset = static_cast<long long>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x;
    for (; offset < sz; offset += stride) {
        Z[offset] = X[offset] + Y[offset];
    }
}
// Host wrapper for element-wise addition: Z = X + Y over sz floats.
//
// X, Y and Z are treated as host buffers: the inputs are staged into temporary
// device buffers with hipMemcpyHostToDevice, the kernel runs on
// compute_stream, and the result is copied back with hipMemcpyDeviceToHost.
// The function blocks until the stream has drained, so Z is valid on return.
//
// On any HIP failure a message is written to stderr, the temporary buffers are
// released and the function returns; Z may then be unmodified.
// sz <= 0 is a no-op.
void rocm_add(int64_t sz, float* Z, const float* X, const float* Y, hipStream_t compute_stream) {
    // Nothing to add; this also keeps a zero-block grid from being launched.
    if (sz <= 0) {
        return;
    }

    // One thread per element: round the block count up so the tail is covered.
    const int threads_per_block = 256;
    const int64_t block_count = (sz + threads_per_block - 1) / threads_per_block;
    if (block_count > 2147483647LL) {
        fprintf(stderr, "rocm_add: %lld elements need more blocks than a grid can hold\n",
                static_cast<long long>(sz));
        return;
    }

    const size_t bytes = static_cast<size_t>(sz) * sizeof(float);
    float *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    // Single-pass block: the first failing step breaks out to the shared cleanup.
    do {
        // Allocate the device-side arrays.
        step = "hipMalloc for d_X";
        if ((err = hipMalloc((void**)&d_X, bytes)) != hipSuccess) break;
        step = "hipMalloc for d_Y";
        if ((err = hipMalloc((void**)&d_Y, bytes)) != hipSuccess) break;
        step = "hipMalloc for d_Z";
        if ((err = hipMalloc((void**)&d_Z, bytes)) != hipSuccess) break;

        // Copy the host inputs X and Y to the device.
        step = "hipMemcpyAsync for d_X";
        if ((err = hipMemcpyAsync(d_X, X, bytes, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_Y";
        if ((err = hipMemcpyAsync(d_Y, Y, bytes, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;

        // Launch the kernel; launch-configuration errors surface via hipGetLastError.
        step = "Kernel launch";
        _Add<<<dim3(static_cast<unsigned int>(block_count)), dim3(threads_per_block), 0, compute_stream>>>(
            static_cast<long long>(sz), d_Z, d_X, d_Y);
        if ((err = hipGetLastError()) != hipSuccess) break;

        // Copy the result back to the host.
        step = "hipMemcpyAsync for Z";
        if ((err = hipMemcpyAsync(Z, d_Z, bytes, hipMemcpyDeviceToHost, compute_stream)) != hipSuccess) break;

        // Wait for the stream so that Z is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(compute_stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "%s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release device memory; pointers that were never allocated are still null.
    hipFree(d_X);
    hipFree(d_Y);
    hipFree(d_Z);
}


// Concat
// Concatenates two row-major 2-D float matrices X1 (M1 x N1) and X2 (M2 x N2)
// into Z, one thread per candidate output element.
//
// Launch layout: 2-D grid of 2-D blocks; x indexes columns, y indexes rows.
//   axis == 0: X1 fills rows [0, M1) and X2 fills rows [M1, M1 + M2). X1 rows
//              are written with stride N1 and X2 rows with stride N2.
//   axis == 1: X1 fills columns [0, N1) and X2 fills columns [N1, N1 + N2) of
//              rows with stride N1 + N2.
// Any other axis value, and any thread outside both regions, writes nothing.
__global__ void _Concat2D(int axis,
                          int M1, int N1, const float* X1,
                          int M2, int N2, const float* X2,
                          float* Z) {
    const int r = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
    const int c = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

    // Both modes copy the same X1 region; only the destination stride differs.
    const bool from_first = (r < M1) && (c < N1);

    switch (axis) {
        case 0: {  // stack along rows
            if (from_first) {
                Z[r * N1 + c] = X1[r * N1 + c];
                break;
            }
            const bool from_second = (r >= M1) && (r < M1 + M2) && (c < N2);
            if (from_second) {
                Z[r * N2 + c] = X2[(r - M1) * N2 + c];
            }
            break;
        }
        case 1: {  // stack along columns
            const int out_stride = N1 + N2;
            if (from_first) {
                Z[r * out_stride + c] = X1[r * N1 + c];
                break;
            }
            const bool from_second = (r < M2) && (c >= N1) && (c < N1 + N2);
            if (from_second) {
                Z[r * out_stride + c] = X2[r * N2 + (c - N1)];
            }
            break;
        }
        default:
            break;
    }
}
// Host wrapper: concatenates two row-major float matrices X1 (M1 x N1) and
// X2 (M2 x N2) along `axis` (0 = rows, 1 = columns) and stores the result in Z.
//
// X1, X2 and Z are treated as host buffers: inputs are staged to the device
// with hipMemcpyHostToDevice, the kernel runs on compute_stream, and the result
// is copied back with hipMemcpyDeviceToHost. The function blocks until the
// stream has drained.
//
// Z must hold (M1 + M2) * N1 floats for axis 0 and M1 * (N1 + N2) floats for
// axis 1. The dimension that is not concatenated must match (N1 == N2 for
// axis 0, M1 == M2 for axis 1); otherwise the kernel would write outside Z, so
// such calls are rejected. Invalid arguments and HIP failures are reported on
// stderr and leave Z untouched or partially written.
void rocm_concat(int axis,
                 int M1, int N1, const float* X1,
                 int M2, int N2, const float* X2,
                 float* Z,
                 hipStream_t compute_stream) {
    if (axis != 0 && axis != 1) {
        fprintf(stderr, "rocm_concat: unsupported axis %d (expected 0 or 1)\n", axis);
        return;
    }
    if (M1 < 0 || N1 < 0 || M2 < 0 || N2 < 0) {
        fprintf(stderr, "rocm_concat: negative dimension\n");
        return;
    }
    const bool shapes_match = (axis == 0) ? (N1 == N2) : (M1 == M2);
    if (!shapes_match) {
        fprintf(stderr, "rocm_concat: inputs %dx%d and %dx%d cannot be concatenated along axis %d\n",
                M1, N1, M2, N2, axis);
        return;
    }

    // Output shape, computed in size_t so that the sums and products cannot overflow int.
    const size_t out_rows = (axis == 0) ? static_cast<size_t>(M1) + static_cast<size_t>(M2)
                                        : static_cast<size_t>(M1);
    const size_t out_cols = (axis == 0) ? static_cast<size_t>(N1)
                                        : static_cast<size_t>(N1) + static_cast<size_t>(N2);
    if (out_rows == 0 || out_cols == 0) {
        return;  // empty output: nothing to copy or launch
    }

    const size_t size1 = static_cast<size_t>(M1) * N1 * sizeof(float);
    const size_t size2 = static_cast<size_t>(M2) * N2 * sizeof(float);
    const size_t sizeZ = out_rows * out_cols * sizeof(float);

    // 16x16 threads per block; round the grid up in both dimensions so that
    // partially filled tiles at the right and bottom edges are still covered.
    const dim3 block(16, 16);
    const dim3 grid(static_cast<unsigned int>((out_cols + 15) / 16),
                    static_cast<unsigned int>((out_rows + 15) / 16));

    float *d_X1 = nullptr, *d_X2 = nullptr, *d_Z = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    do {
        // Allocate device memory.
        step = "hipMalloc for d_X1";
        if ((err = hipMalloc((void**)&d_X1, size1)) != hipSuccess) break;
        step = "hipMalloc for d_X2";
        if ((err = hipMalloc((void**)&d_X2, size2)) != hipSuccess) break;
        step = "hipMalloc for d_Z";
        if ((err = hipMalloc((void**)&d_Z, sizeZ)) != hipSuccess) break;

        // Copy the inputs to the device.
        step = "hipMemcpyAsync for d_X1";
        if ((err = hipMemcpyAsync(d_X1, X1, size1, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_X2";
        if ((err = hipMemcpyAsync(d_X2, X2, size2, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;

        // Launch the kernel.
        step = "kernel launch";
        _Concat2D<<<grid, block, 0, compute_stream>>>(axis, M1, N1, d_X1, M2, N2, d_X2, d_Z);
        if ((err = hipGetLastError()) != hipSuccess) break;

        // Copy the result back to the host.
        step = "hipMemcpyAsync for Z";
        if ((err = hipMemcpyAsync(Z, d_Z, sizeZ, hipMemcpyDeviceToHost, compute_stream)) != hipSuccess) break;

        // Wait for the stream so that Z is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(compute_stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "rocm_concat: %s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release resources; pointers that were never allocated are still null.
    hipFree(d_X1);
    hipFree(d_X2);
    hipFree(d_Z);
}

//gemm
#include <hip/hip_runtime.h>

// Naive GEMM kernel: C = alpha * op(A) * op(B) + beta * C, with one thread per
// element of the row-major M x N matrix C.
//
//   A is read as A[row * K + k], or as A[k * M + row] when transA is set.
//   B is read as B[k * N + col], or as B[col * K + k] when transB is set.
//
// Launch layout: 2-D grid of 2-D blocks; x indexes columns of C, y indexes
// rows. Threads that fall outside the M x N output do nothing.
// C is read before it is written, so it must hold valid data even if beta is 0.
__global__ void _Gemm(bool transA, bool transB,
                      int M, int N, int K,
                      float alpha,
                      const float* A,
                      const float* B,
                      float beta,
                      float* C) {
    const int r = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
    const int c = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

    if (r < M && c < N) {
        // Express each operand walk as "first element + k * step" so the
        // transpose decision is made once instead of on every iteration.
        const int a_first = transA ? r : r * K;
        const int a_step  = transA ? M : 1;
        const int b_first = transB ? c * K : c;
        const int b_step  = transB ? 1 : N;

        float acc = 0.0f;
        for (int k = 0; k < K; ++k) {
            float a = A[a_first + k * a_step];
            float b = B[b_first + k * b_step];
            acc += a * b;
        }

        const int out = r * N + c;
        C[out] = alpha * acc + beta * C[out];
    }
}

// Host wrapper for C = alpha * op(A) * op(B) + beta * C on row-major floats,
// where op(A) is M x K, op(B) is K x N and C is M x N.
//
// A, B and C are treated as host buffers: all three are staged to the device
// with hipMemcpyHostToDevice (C is an input as well because of the beta term),
// the kernel runs on compute_stream, and C is copied back with
// hipMemcpyDeviceToHost. The function blocks until the stream has drained.
//
// M <= 0 or N <= 0 is a no-op (C is empty). On failure a message is written to
// stderr, the temporary buffers are released and C may be left unmodified.
void rocm_gemm(bool transA, bool transB,
               int M, int N, int K,
               float alpha,
               const float* A,
               const float* B,
               float beta,
               float* C,
               hipStream_t compute_stream) {
    if (M <= 0 || N <= 0) {
        return;
    }
    if (K < 0) {
        fprintf(stderr, "rocm_gemm: K must be non-negative (got %d)\n", K);
        return;
    }

    // Element counts are the same whether or not an operand is transposed;
    // widen to size_t before multiplying so large shapes do not overflow int.
    const size_t sizeA = static_cast<size_t>(M) * K * sizeof(float);
    const size_t sizeB = static_cast<size_t>(K) * N * sizeof(float);
    const size_t sizeC = static_cast<size_t>(M) * N * sizeof(float);

    // 16x16 threads per block; x covers columns of C, y covers rows.
    const dim3 block(16, 16);
    const dim3 grid((N + 15) / 16, (M + 15) / 16);

    // Null-initialised so the shared cleanup below can free them unconditionally.
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    do {
        // Allocate device memory.
        step = "hipMalloc for d_A";
        if ((err = hipMalloc((void**)&d_A, sizeA)) != hipSuccess) break;
        step = "hipMalloc for d_B";
        if ((err = hipMalloc((void**)&d_B, sizeB)) != hipSuccess) break;
        step = "hipMalloc for d_C";
        if ((err = hipMalloc((void**)&d_C, sizeC)) != hipSuccess) break;

        // Host -> device copies.
        step = "hipMemcpyAsync for d_A";
        if ((err = hipMemcpyAsync(d_A, A, sizeA, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_B";
        if ((err = hipMemcpyAsync(d_B, B, sizeB, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_C";
        if ((err = hipMemcpyAsync(d_C, C, sizeC, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;

        // Launch the kernel.
        step = "kernel launch";
        _Gemm<<<grid, block, 0, compute_stream>>>(transA, transB, M, N, K, alpha, d_A, d_B, beta, d_C);
        if ((err = hipGetLastError()) != hipSuccess) break;

        // Device -> host copy of the result.
        step = "hipMemcpyAsync for C";
        if ((err = hipMemcpyAsync(C, d_C, sizeC, hipMemcpyDeviceToHost, compute_stream)) != hipSuccess) break;

        // Wait for the stream so that C is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(compute_stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "HIP memory allocation or memcpy failed in rocm_gemm\n");
        fprintf(stderr, "rocm_gemm: %s: %s\n", step, hipGetErrorString(err));
    }

    // Release resources exactly once; unallocated pointers are still null.
    hipFree(d_A);
    hipFree(d_B);
    hipFree(d_C);
}
//GroupNormalization
#include <math.h>

// Group normalization kernel for an N x C x H x W tensor indexed as
// ((n * C + c) * H + h) * W + w.
//
// Launch layout: grid = (G, N) - block x selects the group, block y selects the
// sample - with a 1-D block. The block size must be a power of two no larger
// than 256: the two static shared arrays hold one slot per thread and the tree
// reduction halves the active range each step.
//
// For each (sample, group) the block computes the mean and the variance
// (E[x^2] - mean^2) over the group's C / G channels, then writes
//   Y = gamma[c] * (X - mean) / sqrt(var + eps) + beta[c].
// C is expected to be divisible by G. The parameter N is not read by the
// kernel; the sample index comes from the grid.
__global__ void _GroupNorm(
    int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
    float eps, const float* X, float* Y,
    const float* gamma, const float* beta
) {
    // Group and sample handled by this block.
    const int64_t group_idx = hipBlockIdx_x;
    const int64_t n = hipBlockIdx_y;
    const int64_t channels_per_group = C / G;
    const int64_t c_start = group_idx * channels_per_group;

    // Number of elements per channel and per group.
    const int64_t spatial = H * W;
    const int64_t group_size = channels_per_group * spatial;

    // The channels of one group are adjacent, so the whole group is a single
    // contiguous run starting here; element idx of the group is X[base + idx].
    const int64_t base = (n * C + c_start) * spatial;

    // Shared scratch for the block-wide reduction.
    __shared__ float shared_sum[256];
    __shared__ float shared_sum_sq[256];

    // Each thread accumulates a partial sum and sum of squares.
    float sum = 0.0f, sum_sq = 0.0f;
    for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
        const float val = X[base + idx];
        sum += val;
        sum_sq += val * val;
    }
    shared_sum[hipThreadIdx_x] = sum;
    shared_sum_sq[hipThreadIdx_x] = sum_sq;
    __syncthreads();

    // Tree reduction; the totals end up in slot 0.
    for (int s = hipBlockDim_x / 2; s > 0; s >>= 1) {
        if (hipThreadIdx_x < s) {
            shared_sum[hipThreadIdx_x] += shared_sum[hipThreadIdx_x + s];
            shared_sum_sq[hipThreadIdx_x] += shared_sum_sq[hipThreadIdx_x + s];
        }
        __syncthreads();
    }

    // Mean and variance of the group.
    const float mean = shared_sum[0] / group_size;
    float var = shared_sum_sq[0] / group_size - mean * mean;
    // Rounding can push E[x^2] - mean^2 slightly below zero for nearly constant
    // data; clamp so that sqrtf never sees a negative argument (NaN output).
    var = fmaxf(var, 0.0f);
    const float std_dev = sqrtf(var + eps);

    // Normalize and apply the per-channel affine transform.
    for (int64_t idx = hipThreadIdx_x; idx < group_size; idx += hipBlockDim_x) {
        const int64_t c = c_start + idx / spatial;
        const float val = (X[base + idx] - mean) / std_dev;
        Y[base + idx] = gamma[c] * val + beta[c];
    }
}

// Host wrapper for group normalization of an N x C x H x W float tensor with G
// groups and per-channel scale (gamma) and shift (beta), each of length C.
//
// X, gamma, beta and Y are treated as host buffers: inputs are staged to the
// device with hipMemcpyHostToDevice, the kernel runs on compute_stream with one
// 256-thread block per (group, sample) pair, and the result is copied back into
// Y with hipMemcpyDeviceToHost. The function blocks until the stream has
// drained.
//
// G must be positive and divide C. A tensor with a non-positive dimension is a
// no-op. Invalid arguments and HIP failures are reported on stderr; Y may then
// be left unmodified.
void rocm_group_norm(
    int64_t N, int64_t C, int64_t H, int64_t W, int64_t G,
    float eps, const float* X, float* Y,
    const float* gamma, const float* beta,
    hipStream_t compute_stream
) {
    // Argument validation. G is checked first so that C % G cannot divide by zero.
    if (G <= 0) {
        fprintf(stderr, "Error: Groups must be positive.\n");
        return;
    }
    if (C % G != 0) {
        fprintf(stderr, "Error: Channels must be divisible by groups.\n");
        return;
    }
    if (N <= 0 || C <= 0 || H <= 0 || W <= 0) {
        return;  // empty tensor: nothing to normalize
    }
    if (G > 2147483647LL || N > 2147483647LL) {
        fprintf(stderr, "rocm_group_norm: G or N does not fit in a grid dimension\n");
        return;
    }

    const dim3 block_dim(256);  // 256 threads per block
    const dim3 grid_dim(static_cast<unsigned int>(G), static_cast<unsigned int>(N));  // one block per (group, sample)

    const size_t input_size = static_cast<size_t>(N) * C * H * W * sizeof(float);
    const size_t param_size = static_cast<size_t>(C) * sizeof(float);

    // Null-initialised so the shared cleanup below can free them unconditionally.
    float *d_X = nullptr, *d_Y = nullptr, *d_gamma = nullptr, *d_beta = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    do {
        // Allocate device memory.
        step = "hipMalloc for d_X";
        if ((err = hipMalloc((void**)&d_X, input_size)) != hipSuccess) break;
        step = "hipMalloc for d_Y";
        if ((err = hipMalloc((void**)&d_Y, input_size)) != hipSuccess) break;
        step = "hipMalloc for d_gamma";
        if ((err = hipMalloc((void**)&d_gamma, param_size)) != hipSuccess) break;
        step = "hipMalloc for d_beta";
        if ((err = hipMalloc((void**)&d_beta, param_size)) != hipSuccess) break;

        // Copy the inputs to the device.
        step = "hipMemcpyAsync for d_X";
        if ((err = hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_gamma";
        if ((err = hipMemcpyAsync(d_gamma, gamma, param_size, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_beta";
        if ((err = hipMemcpyAsync(d_beta, beta, param_size, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;

        // Launch the kernel.
        step = "kernel launch";
        _GroupNorm<<<grid_dim, block_dim, 0, compute_stream>>>(
            N, C, H, W, G, eps, d_X, d_Y, d_gamma, d_beta
        );
        if ((err = hipGetLastError()) != hipSuccess) break;

        // Copy the result back to the host.
        step = "hipMemcpyAsync for Y";
        if ((err = hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream)) != hipSuccess) break;

        // Wait for the stream so that Y is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(compute_stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "rocm_group_norm: %s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release resources; pointers that were never allocated are still null.
    hipFree(d_X);
    hipFree(d_Y);
    hipFree(d_gamma);
    hipFree(d_beta);
}

//LogSoftmax

// Row-wise log-softmax over an N x D row-major float matrix:
//   Y[n][i] = (X[n][i] - max_n) - log(sum_j exp(X[n][j] - max_n)).
//
// Launch layout: one 1-D block per row (block x index = row). The block size
// must be a power of two no larger than 256, because each of the two static
// shared arrays holds one slot per thread and the tree reductions halve the
// active range each step. The parameter N is not read; the row comes from the
// grid.
__global__ void _LogSoftmax(int64_t N, int64_t D, const float* X, float* Y) {
    const int64_t sample = hipBlockIdx_x;
    const int lane = hipThreadIdx_x;
    const float* in_row = X + sample * D;
    float* out_row = Y + sample * D;

    // Per-thread partial results for the two block-wide reductions.
    __shared__ float shared_max[256];
    __shared__ float shared_sum[256];

    // Phase 1: row maximum.
    float local_max = -INFINITY;
    for (int64_t i = lane; i < D; i += hipBlockDim_x) {
        local_max = fmaxf(local_max, in_row[i]);
    }
    shared_max[lane] = local_max;
    __syncthreads();

    for (int half = hipBlockDim_x / 2; half > 0; half >>= 1) {
        if (lane < half) {
            if (shared_max[lane + half] > shared_max[lane]) {
                shared_max[lane] = shared_max[lane + half];
            }
        }
        __syncthreads();
    }
    const float row_max = shared_max[0];

    // Phase 2: sum of exponentials, shifted by the maximum for stability.
    float local_sum = 0.0f;
    for (int64_t i = lane; i < D; i += hipBlockDim_x) {
        local_sum += expf(in_row[i] - row_max);
    }
    shared_sum[lane] = local_sum;
    __syncthreads();

    for (int half = hipBlockDim_x / 2; half > 0; half >>= 1) {
        if (lane < half) {
            shared_sum[lane] += shared_sum[lane + half];
        }
        __syncthreads();
    }
    const float log_denominator = logf(shared_sum[0]);

    // Phase 3: write the log-softmax values.
    for (int64_t i = lane; i < D; i += hipBlockDim_x) {
        out_row[i] = (in_row[i] - row_max) - log_denominator;
    }
}

// Host wrapper for row-wise log-softmax over an N x D row-major float matrix.
//
// X and Y are treated as host buffers: X is staged to the device with
// hipMemcpyHostToDevice, the kernel runs on compute_stream with one 256-thread
// block per row, and the result is copied back into Y with
// hipMemcpyDeviceToHost. The function blocks until the stream has drained.
//
// N <= 0 or D <= 0 is a no-op. HIP failures are reported on stderr and may
// leave Y unmodified.
void rocm_log_softmax(
    int64_t N, int64_t D, const float* X, float* Y, hipStream_t compute_stream
) {
    if (N <= 0 || D <= 0) {
        return;  // empty input; also avoids launching a zero-block grid
    }
    if (N > 2147483647LL) {
        fprintf(stderr, "rocm_log_softmax: N does not fit in a grid dimension\n");
        return;
    }

    const dim3 block_dim(256);                           // 256 threads per block
    const dim3 grid_dim(static_cast<unsigned int>(N));   // one block per row

    const size_t input_size = static_cast<size_t>(N) * D * sizeof(float);

    // Null-initialised so the shared cleanup below can free them unconditionally.
    float *d_X = nullptr, *d_Y = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    do {
        // Allocate device memory.
        step = "hipMalloc for d_X";
        if ((err = hipMalloc((void**)&d_X, input_size)) != hipSuccess) break;
        step = "hipMalloc for d_Y";
        if ((err = hipMalloc((void**)&d_Y, input_size)) != hipSuccess) break;

        // Copy the input to the device.
        step = "hipMemcpyAsync for d_X";
        if ((err = hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, compute_stream)) != hipSuccess) break;

        // Launch the kernel.
        step = "kernel launch";
        _LogSoftmax<<<grid_dim, block_dim, 0, compute_stream>>>(N, D, d_X, d_Y);
        if ((err = hipGetLastError()) != hipSuccess) break;

        // Copy the result back to the host.
        step = "hipMemcpyAsync for Y";
        if ((err = hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, compute_stream)) != hipSuccess) break;

        // Wait for the stream so that Y is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(compute_stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "rocm_log_softmax: %s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release resources; pointers that were never allocated are still null.
    hipFree(d_X);
    hipFree(d_Y);
}

//attention
// Scaled dot-product attention for row-major Q, K, V and output, each indexed
// as (b * S + s) * H + h:
//   output[b, i, :] = softmax_k( (Q[b, i, :] . K[b, k, :]) / scaling ) * V[b, :, :]
// Note that `scaling` is used as a divisor of the raw scores.
//
// Launch layout: 3-D grid of 2-D blocks; x indexes the hidden dimension j,
// y indexes the query position i, z indexes the batch b. One thread produces
// one output element and threads outside the tensor exit immediately.
//
// The softmax is evaluated in a single streaming pass: the running maximum,
// the running normaliser and the running weighted sum of V are updated for
// each key, and earlier contributions are rescaled whenever the maximum grows.
// This needs no per-thread score buffer, so S is not limited to any fixed
// sequence length. Offsets are computed in size_t to avoid 32-bit overflow.
__global__ void _DotProductAttention(int B, int S, int H,
                                     const float* Q, const float* K, const float* V,
                                     float scaling,
                                     float* output) {
    const int b = blockIdx.z;
    const int i = blockIdx.y * blockDim.y + threadIdx.y; // query index
    const int j = blockIdx.x * blockDim.x + threadIdx.x; // hidden dim

    if (b >= B || i >= S || j >= H) return;

    const size_t batch_row = static_cast<size_t>(b) * S;
    const size_t q_off = (batch_row + i) * H;

    float running_max = -INFINITY;  // largest score seen so far
    float denom = 0.f;              // sum of exp(score - running_max)
    float acc = 0.f;                // sum of exp(score - running_max) * V[k][j]

    for (int k = 0; k < S; ++k) {
        const size_t kv_off = (batch_row + k) * H;

        // score = Q[i] . K[k] / scaling
        float dot = 0.f;
        for (int h = 0; h < H; ++h) {
            dot += Q[q_off + h] * K[kv_off + h];
        }
        const float score = dot / scaling;

        // Re-express the accumulated terms relative to the new maximum, then
        // add this key's contribution.
        const float new_max = fmaxf(running_max, score);
        const float rescale = expf(running_max - new_max);
        const float weight = expf(score - new_max);
        denom = denom * rescale + weight;
        acc = acc * rescale + weight * V[kv_off + j];
        running_max = new_max;
    }

    output[q_off + j] = acc / denom;
}
// Host wrapper for scaled dot-product attention over B batches of S positions
// with hidden size H. Q, K, V and Out each hold B * S * H floats.
//
// Q, K, V and Out are treated as host buffers: inputs are staged to the device
// with hipMemcpyHostToDevice, the kernel runs on `stream` with scores divided
// by sqrt(H), and the result is copied back into Out with
// hipMemcpyDeviceToHost. The function blocks until the stream has drained.
//
// A non-positive B, S or H is a no-op. HIP failures are reported on stderr and
// may leave Out unmodified.
extern "C" void rocm_attention(int B, int S, int H,
                            const float* Q, const float* K, const float* V,
                            float* Out, hipStream_t stream) {
    if (B <= 0 || S <= 0 || H <= 0) {
        return;
    }

    // 16x16 threads per block; x covers the hidden dim, y the sequence, z the batch.
    const dim3 block(16, 16);
    const dim3 grid((H + 15) / 16, (S + 15) / 16, B);

    // Widen before multiplying so that B * S * H cannot overflow int.
    const size_t size = static_cast<size_t>(B) * S * H * sizeof(float);
    const float scale = sqrtf((float)H);

    // Null-initialised so the shared cleanup below can free them unconditionally.
    float *d_Q = nullptr, *d_K = nullptr, *d_V = nullptr, *d_Out = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    do {
        step = "hipMalloc for d_Q";
        if ((err = hipMalloc((void**)&d_Q, size)) != hipSuccess) break;
        step = "hipMalloc for d_K";
        if ((err = hipMalloc((void**)&d_K, size)) != hipSuccess) break;
        step = "hipMalloc for d_V";
        if ((err = hipMalloc((void**)&d_V, size)) != hipSuccess) break;
        step = "hipMalloc for d_Out";
        if ((err = hipMalloc((void**)&d_Out, size)) != hipSuccess) break;

        step = "hipMemcpyAsync for d_Q";
        if ((err = hipMemcpyAsync(d_Q, Q, size, hipMemcpyHostToDevice, stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_K";
        if ((err = hipMemcpyAsync(d_K, K, size, hipMemcpyHostToDevice, stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for d_V";
        if ((err = hipMemcpyAsync(d_V, V, size, hipMemcpyHostToDevice, stream)) != hipSuccess) break;

        step = "kernel launch";
        _DotProductAttention<<<grid, block, 0, stream>>>(B, S, H, d_Q, d_K, d_V, scale, d_Out);
        if ((err = hipGetLastError()) != hipSuccess) break;

        step = "hipMemcpyAsync for Out";
        if ((err = hipMemcpyAsync(Out, d_Out, size, hipMemcpyDeviceToHost, stream)) != hipSuccess) break;

        // Wait for the stream so that Out is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "rocm_attention: %s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release resources; pointers that were never allocated are still null.
    hipFree(d_Q);
    hipFree(d_K);
    hipFree(d_V);
    hipFree(d_Out);
}


// BatchNormalization
// Inference-style batch normalization over an N x C x H x W float tensor
// indexed as ((n * C + c) * H + h) * W + w:
//   Y = gamma[c] * (X - mean[c]) / sqrt(var[c] + epsilon) + beta[c]
// gamma, beta, mean and var each hold C values.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element; threads past
// the end of the tensor exit. The element count and the global index are kept
// in 64 bits so that tensors with more than 2^31 - 1 elements are indexed
// correctly. A non-positive dimension makes the kernel a no-op.
__global__ void _BatchNormalization(
    int N, int C, int H, int W,
    const float* X,
    const float* gamma,
    const float* beta,
    const float* mean,
    const float* var,
    float epsilon,
    float* Y) {

  if (N <= 0 || C <= 0 || H <= 0 || W <= 0) return;

  // Global thread index, widened before the multiply.
  const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t plane = static_cast<size_t>(H) * W;   // elements per channel
  const size_t total = static_cast<size_t>(N) * C * plane;
  if (idx >= total) return;

  // The flat index already is the element offset; only the channel is needed
  // to select the per-channel statistics and affine parameters.
  const int c = static_cast<int>((idx / plane) % static_cast<size_t>(C));

  // Y = gamma[c] * (X - mean[c]) / sqrt(var[c] + eps) + beta[c]
  const float x = X[idx];
  const float m = mean[c];
  const float v = var[c];
  const float inv_std = rsqrtf(v + epsilon);
  Y[idx] = gamma[c] * ((x - m) * inv_std) + beta[c];
}

// host API：rocm_batch_norm
// Host wrapper for batch normalization of an N x C x H x W float tensor using
// per-channel gamma, beta, mean and var (C values each) and `epsilon`.
//
// All pointer arguments are treated as host buffers: inputs are staged to the
// device with hipMemcpyHostToDevice, the kernel runs on `stream` with one
// thread per element, and the result is copied back into Y with
// hipMemcpyDeviceToHost. The function blocks until the stream has drained.
//
// A non-positive dimension is a no-op. Dimensions that do not fit the kernel's
// int parameters and HIP failures are reported on stderr; Y may then be left
// unmodified.
extern "C" void rocm_batch_norm(
    int64_t N, int64_t C, int64_t H, int64_t W,
    const float* X,
    const float* gamma,
    const float* beta,
    const float* mean,
    const float* var,
    float epsilon,
    float* Y,
    hipStream_t stream) {

  if (N <= 0 || C <= 0 || H <= 0 || W <= 0) {
    return;  // empty tensor; also avoids launching a zero-block grid
  }

  // The kernel takes its dimensions as int, so reject values that would be truncated.
  const int64_t kMaxInt = 2147483647LL;
  if (N > kMaxInt || C > kMaxInt || H > kMaxInt || W > kMaxInt) {
    fprintf(stderr, "rocm_batch_norm: dimension exceeds the 32-bit range of the kernel\n");
    return;
  }

  const size_t total = (size_t)N * C * H * W;
  const size_t tensor_bytes = total * sizeof(float);
  const size_t param_bytes = (size_t)C * sizeof(float);

  // One-dimensional launch: one thread per element, block count rounded up.
  const int threads = 256;
  const size_t blocks = (total + threads - 1) / threads;
  if (blocks > (size_t)kMaxInt) {
    fprintf(stderr, "rocm_batch_norm: tensor needs more blocks than a grid can hold\n");
    return;
  }

  // Null-initialised so the shared cleanup below can free them unconditionally.
  float *d_X = nullptr, *d_gamma = nullptr, *d_beta = nullptr,
        *d_mean = nullptr, *d_var = nullptr, *d_Y = nullptr;
  hipError_t err = hipSuccess;
  const char* step = "";  // name of the operation in flight, used for the error report

  do {
    // Allocate device buffers for X, Y, gamma, beta, mean and var.
    step = "hipMalloc for d_X";
    if ((err = hipMalloc((void**)&d_X, tensor_bytes)) != hipSuccess) break;
    step = "hipMalloc for d_Y";
    if ((err = hipMalloc((void**)&d_Y, tensor_bytes)) != hipSuccess) break;
    step = "hipMalloc for d_gamma";
    if ((err = hipMalloc((void**)&d_gamma, param_bytes)) != hipSuccess) break;
    step = "hipMalloc for d_beta";
    if ((err = hipMalloc((void**)&d_beta, param_bytes)) != hipSuccess) break;
    step = "hipMalloc for d_mean";
    if ((err = hipMalloc((void**)&d_mean, param_bytes)) != hipSuccess) break;
    step = "hipMalloc for d_var";
    if ((err = hipMalloc((void**)&d_var, param_bytes)) != hipSuccess) break;

    // Copy the inputs to the device.
    step = "hipMemcpyAsync for d_X";
    if ((err = hipMemcpyAsync(d_X, X, tensor_bytes, hipMemcpyHostToDevice, stream)) != hipSuccess) break;
    step = "hipMemcpyAsync for d_gamma";
    if ((err = hipMemcpyAsync(d_gamma, gamma, param_bytes, hipMemcpyHostToDevice, stream)) != hipSuccess) break;
    step = "hipMemcpyAsync for d_beta";
    if ((err = hipMemcpyAsync(d_beta, beta, param_bytes, hipMemcpyHostToDevice, stream)) != hipSuccess) break;
    step = "hipMemcpyAsync for d_mean";
    if ((err = hipMemcpyAsync(d_mean, mean, param_bytes, hipMemcpyHostToDevice, stream)) != hipSuccess) break;
    step = "hipMemcpyAsync for d_var";
    if ((err = hipMemcpyAsync(d_var, var, param_bytes, hipMemcpyHostToDevice, stream)) != hipSuccess) break;

    // Launch the kernel.
    step = "kernel launch";
    _BatchNormalization<<<dim3((unsigned int)blocks), dim3(threads), 0, stream>>>(
        (int)N, (int)C, (int)H, (int)W,
        d_X, d_gamma, d_beta, d_mean, d_var,
        epsilon,
        d_Y);
    if ((err = hipGetLastError()) != hipSuccess) break;

    // Copy the result back to the host.
    step = "hipMemcpyAsync for Y";
    if ((err = hipMemcpyAsync(Y, d_Y, tensor_bytes, hipMemcpyDeviceToHost, stream)) != hipSuccess) break;

    // Wait for the stream so that Y is complete before returning.
    step = "hipStreamSynchronize";
    err = hipStreamSynchronize(stream);
  } while (0);

  if (err != hipSuccess) {
    fprintf(stderr, "rocm_batch_norm: %s failed: %s\n", step, hipGetErrorString(err));
  }

  // Release device memory; pointers that were never allocated are still null.
  hipFree(d_X);
  hipFree(d_Y);
  hipFree(d_gamma);
  hipFree(d_beta);
  hipFree(d_mean);
  hipFree(d_var);
}

// Cast Operator: float to int32
// Device kernel: cast each element
// Converts `total` floats from X to 32-bit integers in Y using static_cast
// (the fractional part is discarded).
// Launch layout: 1-D grid of 1-D blocks, one thread per element; threads whose
// index is at or beyond `total` do nothing.
__global__ void _Cast(
    int total,
    const float* X,
    int* Y) {
  const int element = blockIdx.x * blockDim.x + threadIdx.x;
  if (element < total) {
    const float source = X[element];
    Y[element] = static_cast<int>(source);
  }
}

// Host API: rocm_cast
// Host wrapper that converts an N x C x H x W float tensor to int32.
//
// X and Y are treated as host buffers: X is staged to the device with
// hipMemcpyHostToDevice, the kernel runs on `stream` with one thread per
// element, and the result is copied back into Y with hipMemcpyDeviceToHost.
// The function blocks until the stream has drained.
//
// A non-positive dimension is a no-op. Tensors with more elements than the
// kernel's int element count can express, and HIP failures, are reported on
// stderr; Y may then be left unmodified.
extern "C" void rocm_cast(
    int64_t N, int64_t C, int64_t H, int64_t W,
    const float* X,
    int* Y,
    hipStream_t stream) {

  if (N <= 0 || C <= 0 || H <= 0 || W <= 0) {
    return;  // empty tensor; also avoids launching a zero-block grid
  }

  // Total number of elements.
  const size_t total = (size_t)N * C * H * W;

  // The kernel receives the element count as int; refuse to truncate it.
  if (total > (size_t)2147483647) {
    fprintf(stderr, "rocm_cast: tensor has more elements than the kernel can index\n");
    return;
  }

  // One thread per element, block count rounded up.
  const int threads = 256;
  const int blocks = (int)((total + threads - 1) / threads);

  // Null-initialised so the shared cleanup below can free them unconditionally.
  float* d_X = nullptr;
  int* d_Y = nullptr;
  hipError_t err = hipSuccess;
  const char* step = "";  // name of the operation in flight, used for the error report

  do {
    // Allocate device memory.
    step = "hipMalloc for d_X";
    if ((err = hipMalloc((void**)&d_X, total * sizeof(float))) != hipSuccess) break;
    step = "hipMalloc for d_Y";
    if ((err = hipMalloc((void**)&d_Y, total * sizeof(int))) != hipSuccess) break;

    // Copy the input to the device.
    step = "hipMemcpyAsync for d_X";
    if ((err = hipMemcpyAsync(d_X, X, total * sizeof(float), hipMemcpyHostToDevice, stream)) != hipSuccess) break;

    // Launch the kernel.
    step = "kernel launch";
    _Cast<<<blocks, threads, 0, stream>>>(
        (int)total,
        d_X,
        d_Y);
    if ((err = hipGetLastError()) != hipSuccess) break;

    // Copy the result back to the host.
    step = "hipMemcpyAsync for Y";
    if ((err = hipMemcpyAsync(Y, d_Y, total * sizeof(int), hipMemcpyDeviceToHost, stream)) != hipSuccess) break;

    // Wait for the stream so that Y is complete before returning.
    step = "hipStreamSynchronize";
    err = hipStreamSynchronize(stream);
  } while (0);

  if (err != hipSuccess) {
    fprintf(stderr, "rocm_cast: %s failed: %s\n", step, hipGetErrorString(err));
  }

  // Release device memory; pointers that were never allocated are still null.
  hipFree(d_X);
  hipFree(d_Y);
}
// Row-wise softmax over an M x N row-major float matrix (M rows of N values).
//
// Launch layout: 1-D grid of 1-D blocks with one thread per ROW; threads whose
// row index is at or beyond M exit. Each thread makes three sequential passes
// over its row: maximum, exp/sum (written to Y as scratch), normalisation.
// Rows with N <= 0 are skipped so that no element is read from an empty row,
// and the row offset is computed in size_t to avoid 32-bit overflow.
extern "C" __global__
void SoftmaxKernel(const float* X, float* Y, int M, int N) {
    // M = batch_size, N = feature_size
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || N <= 0) return;

    const size_t row_offset = static_cast<size_t>(row) * static_cast<size_t>(N);
    const float* x_row = X + row_offset;
    float*       y_row = Y + row_offset;

    // 1) Row maximum, subtracted below for numerical stability.
    float m = x_row[0];
    for (int j = 1; j < N; ++j) {
        m = fmaxf(m, x_row[j]);
    }

    // 2) Compute exp(x - m) and accumulate the sum.
    float sum = 0.f;
    for (int j = 0; j < N; ++j) {
        const float e = expf(x_row[j] - m);
        y_row[j] = e;
        sum += e;
    }

    // 3) Normalise.
    for (int j = 0; j < N; ++j) {
        y_row[j] /= sum;
    }
}

// 这个函数由 ONNX Runtime 调用，替代原来的 rocm_add
// Launches the row-wise softmax kernel for an M x N row-major float matrix on
// `stream`, one thread per row with 128 threads per block.
//
// X and Y are passed to the kernel unchanged, so they must be addressable from
// the device. NOTE(review): the other wrappers in this file stage host buffers
// through hipMalloc/hipMemcpyAsync - confirm which kind of pointer the caller
// provides here.
// The launch is asynchronous: this function does not synchronize the stream,
// so Y is only valid once work queued on `stream` has completed.
//
// M <= 0 or N <= 0 is a no-op. Sizes that do not fit the kernel's int
// parameters, and launch failures, are reported on stderr.
extern "C"
void rocm_softmax(int64_t M, int64_t N,
                  const float* X, float* Y,
                  hipStream_t stream) {
    if (M <= 0 || N <= 0) {
        return;  // empty input; also avoids launching a zero-block grid
    }
    // The kernel takes M and N as int; refuse to truncate them.
    if (M > 2147483647LL || N > 2147483647LL) {
        fprintf(stderr, "rocm_softmax: M or N exceeds the 32-bit range of the kernel\n");
        return;
    }

    // One thread per row, 128 threads per block, block count rounded up.
    const int threads = 128;
    const int blocks  = static_cast<int>((M + threads - 1) / threads);

    hipLaunchKernelGGL(
        SoftmaxKernel,
        dim3(blocks), dim3(threads),
        0,      // shared mem
        stream, // hip stream
        X, Y, static_cast<int>(M), static_cast<int>(N)
    );

    // Launch-configuration errors are only visible through hipGetLastError.
    const hipError_t err = hipGetLastError();
    if (err != hipSuccess) {
        fprintf(stderr, "rocm_softmax: kernel launch failed: %s\n", hipGetErrorString(err));
    }
}

// CELU activation, element-wise over `size` values:
//   Y = max(0, x) + (x <= 0 ? alpha * (exp(x / alpha) - 1) : 0)
// Launch layout: 1-D grid of 1-D blocks, one thread per element; threads whose
// index is at or beyond `size` do nothing.
// The two terms are evaluated by independent comparisons, so an input for
// which both comparisons are false (NaN) produces 0.
template <typename T>
__global__ void _CeluKernel(const T* X, T* Y, int64_t size, T alpha) {
    const int64_t element = blockIdx.x * blockDim.x + threadIdx.x;
    if (element >= size) {
        return;
    }

    const T zero = T(0);
    const T x = X[element];

    // Positive branch: passes x through unchanged when x > 0.
    T positive_part = zero;
    if (x > zero) {
        positive_part = x;
    }

    // Negative branch: scaled exponential when x <= 0.
    T negative_part = zero;
    if (x <= zero) {
        negative_part = alpha * (exp(x / alpha) - T(1));
    }

    Y[element] = positive_part + negative_part;
}

// Host wrapper for the CELU activation over `size` floats with parameter alpha.
//
// X and Y are treated as host buffers: X is staged to the device with
// hipMemcpyHostToDevice, the kernel runs on `stream` with one thread per
// element, and the result is copied back into Y with hipMemcpyDeviceToHost.
// The function blocks until the stream has drained.
//
// size <= 0 is a no-op. HIP failures are reported on stderr and may leave Y
// unmodified.
extern "C" void rocm_celu(int64_t size,
                           const float* X,
                           float* Y,
                           float alpha,
                           hipStream_t stream) {
    if (size <= 0) {
        return;  // empty input; also avoids launching a zero-block grid
    }

    // One thread per element, block count rounded up.
    const int threads = 256;
    const int64_t blocks = (size + threads - 1) / threads;
    if (blocks > 2147483647LL) {
        fprintf(stderr, "rocm_celu: input needs more blocks than a grid can hold\n");
        return;
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(float);

    // Null-initialised so the shared cleanup below can free them unconditionally.
    float *d_X = nullptr, *d_Y = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    do {
        step = "hipMalloc for d_X";
        if ((err = hipMalloc((void**)&d_X, bytes)) != hipSuccess) break;
        step = "hipMalloc for d_Y";
        if ((err = hipMalloc((void**)&d_Y, bytes)) != hipSuccess) break;

        step = "hipMemcpyAsync for d_X";
        if ((err = hipMemcpyAsync(d_X, X, bytes, hipMemcpyHostToDevice, stream)) != hipSuccess) break;

        step = "kernel launch";
        _CeluKernel<float><<<dim3(static_cast<unsigned int>(blocks)), dim3(threads), 0, stream>>>(
            d_X, d_Y, size, alpha);
        if ((err = hipGetLastError()) != hipSuccess) break;

        step = "hipMemcpyAsync for Y";
        if ((err = hipMemcpyAsync(Y, d_Y, bytes, hipMemcpyDeviceToHost, stream)) != hipSuccess) break;

        // Wait for the stream so that Y is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "rocm_celu: %s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release device memory; pointers that were never allocated are still null.
    hipFree(d_X);
    hipFree(d_Y);
}


//relu
// ReLU activation: output[i] = fmaxf(0, input[i]) for i in [0, size).
// Launch layout: 1-D grid of 1-D blocks, one thread per element; threads whose
// index is at or beyond `size` do nothing.
// The template parameter T is not used by the body; the buffers are float.
template <typename T>
__global__ void _rocm_relu_kernel(float* input, float* output, int64_t size) {
    const int64_t element = blockIdx.x * blockDim.x + threadIdx.x;
    if (element < size) {
        const float value = input[element];
        output[element] = fmaxf(0.0f, value);
    }
}
// Host wrapper for the ReLU activation over `size` floats.
//
// X and Y are treated as host buffers: X is staged to the device with
// hipMemcpyHostToDevice, the kernel runs on `stream` with one thread per
// element, and the result is copied back into Y with hipMemcpyDeviceToHost.
// The function blocks until the stream has drained.
//
// size <= 0 is a no-op. HIP failures are reported on stderr and may leave Y
// unmodified.
extern "C" void rocm_relu(
    int64_t size,
    const float* X,
    float* Y,
    hipStream_t stream
) {
    if (size <= 0) {
        return;  // empty input; also avoids launching a zero-block grid
    }

    // One thread per element, block count rounded up.
    const int threads = 256;
    const int64_t blocks = (size + threads - 1) / threads;
    if (blocks > 2147483647LL) {
        fprintf(stderr, "rocm_relu: input needs more blocks than a grid can hold\n");
        return;
    }

    const size_t input_size = static_cast<size_t>(size) * sizeof(float);

    // Null-initialised so the shared cleanup below can free them unconditionally.
    float *d_X = nullptr, *d_Y = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report

    do {
        step = "hipMalloc for d_X";
        if ((err = hipMalloc((void**)&d_X, input_size)) != hipSuccess) break;
        step = "hipMalloc for d_Y";
        if ((err = hipMalloc((void**)&d_Y, input_size)) != hipSuccess) break;

        step = "hipMemcpyAsync for d_X";
        if ((err = hipMemcpyAsync(d_X, X, input_size, hipMemcpyHostToDevice, stream)) != hipSuccess) break;

        step = "kernel launch";
        _rocm_relu_kernel<float><<<dim3(static_cast<unsigned int>(blocks)), dim3(threads), 0, stream>>>(
            d_X, d_Y, size);
        if ((err = hipGetLastError()) != hipSuccess) break;

        step = "hipMemcpyAsync for Y";
        if ((err = hipMemcpyAsync(Y, d_Y, input_size, hipMemcpyDeviceToHost, stream)) != hipSuccess) break;

        // Wait for the stream so that Y is complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(stream);
    } while (0);

    if (err != hipSuccess) {
        fprintf(stderr, "rocm_relu: %s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release device memory; pointers that were never allocated are still null.
    hipFree(d_X);
    hipFree(d_Y);
}

// -------------------------------
// TopK
// -------------------------------
// Row-wise top-K selection over an M x N row-major float matrix.
// For every row the K largest values are written to `values` in descending
// order and their column positions to `indices` (both M x K, row-major).
//
// Launch layout: 1-D grid of 1-D blocks with one thread per ROW; threads whose
// row index is at or beyond M exit. Every thread works exclusively on its own
// K output slots, which double as the working set, so rows never share state
// and no barrier is required. Dynamic shared memory supplied at launch is not
// used by the kernel.
//
// Selection keeps the row's current best K sorted at all times: a candidate
// that beats the smallest kept value is inserted at its sorted position and
// the smallest value falls off the end. Among equal values the earlier column
// comes first. Slots that can never be filled (K > N, or candidates that do
// not compare greater than -infinity, such as NaN) keep value -INFINITY and
// index -1.
extern "C" __global__
void TopKKernel(
    const float* __restrict__ X,   // [M * N]
    float* __restrict__ values,    // [M * K]
    int64_t* __restrict__ indices, // [M * K]
    int M,
    int N,
    int K
) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || K <= 0) return;

    const float* x_row = X + size_t(row) * N;
    float*       v_row = values + size_t(row) * K;
    int64_t*     i_row = indices + size_t(row) * K;

    // Start with K empty slots.
    for (int t = 0; t < K; ++t) {
        v_row[t] = -INFINITY;
        i_row[t] = -1;
    }

    // Scan the row once, keeping v_row sorted in descending order.
    for (int j = 0; j < N; ++j) {
        const float v = x_row[j];

        // v_row[K - 1] is the smallest kept value; skip candidates that do not beat it.
        if (!(v > v_row[K - 1])) continue;

        // Shift strictly smaller entries down by one to open the insertion
        // slot; the previous last entry is overwritten and thereby dropped.
        int pos = K - 1;
        while (pos > 0 && v_row[pos - 1] < v) {
            v_row[pos] = v_row[pos - 1];
            i_row[pos] = i_row[pos - 1];
            --pos;
        }
        v_row[pos] = v;
        i_row[pos] = j;
    }
}

// Host wrapper for row-wise top-K over an M x N row-major float matrix.
// Writes the selected values to `values` and their column positions to
// `indices`, both M x K.
//
// X, values and indices are treated as host buffers: X is staged to the device
// with hipMemcpyHostToDevice, the kernel runs on `stream` with one thread per
// row, and both outputs are copied back with hipMemcpyDeviceToHost. The
// function blocks until the stream has drained.
//
// M <= 0 or K <= 0 is a no-op. Sizes that do not fit the kernel's int
// parameters and HIP failures are reported on stderr; the outputs may then be
// left unmodified.
extern "C"
void rocm_topk(
    int64_t M,
    int64_t N,
    int64_t K,
    const float* X,
    float* values,
    int64_t* indices,
    hipStream_t stream
) {
    if (M <= 0 || K <= 0) {
        return;  // no output elements; also avoids launching a zero-block grid
    }
    if (N < 0) {
        fprintf(stderr, "rocm_topk: N must be non-negative\n");
        return;
    }
    // The kernel takes M, N and K as int; refuse to truncate them.
    if (M > 2147483647LL || N > 2147483647LL || K > 2147483647LL) {
        fprintf(stderr, "rocm_topk: M, N or K exceeds the 32-bit range of the kernel\n");
        return;
    }

    const size_t sizeX      = size_t(M) * N * sizeof(float);
    const size_t sizeOutVal = size_t(M) * K * sizeof(float);
    const size_t sizeOutIdx = size_t(M) * K * sizeof(int64_t);

    // Launch configuration: one thread per row, 128 threads per block, and a
    // dynamic shared-memory request of K (float, int) pairs.
    const dim3 block(128);
    const dim3 grid(static_cast<unsigned int>((M + block.x - 1) / block.x));
    const size_t shared_bytes = size_t(K) * (sizeof(float) + sizeof(int));

    // Null-initialised so the shared cleanup below can free them unconditionally.
    float*   d_X    = nullptr;
    float*   d_vals = nullptr;
    int64_t* d_idx  = nullptr;
    hipError_t err = hipSuccess;
    const char* step = "";  // name of the operation in flight, used for the error report
    bool allocated = false; // distinguishes allocation failures in the report

    do {
        // Allocate device memory.
        step = "hipMalloc for d_X";
        if ((err = hipMalloc((void**)&d_X, sizeX)) != hipSuccess) break;
        step = "hipMalloc for d_vals";
        if ((err = hipMalloc((void**)&d_vals, sizeOutVal)) != hipSuccess) break;
        step = "hipMalloc for d_idx";
        if ((err = hipMalloc((void**)&d_idx, sizeOutIdx)) != hipSuccess) break;
        allocated = true;

        // Copy the input to the device.
        step = "hipMemcpyAsync for d_X";
        if ((err = hipMemcpyAsync(d_X, X, sizeX, hipMemcpyHostToDevice, stream)) != hipSuccess) break;

        // Launch the kernel.
        step = "kernel launch";
        hipLaunchKernelGGL(
            TopKKernel,
            grid, block, shared_bytes, stream,
            d_X, d_vals, d_idx,
            int(M), int(N), int(K)
        );
        if ((err = hipGetLastError()) != hipSuccess) break;

        // Copy the results back to the host.
        step = "hipMemcpyAsync for values";
        if ((err = hipMemcpyAsync(values, d_vals, sizeOutVal, hipMemcpyDeviceToHost, stream)) != hipSuccess) break;
        step = "hipMemcpyAsync for indices";
        if ((err = hipMemcpyAsync(indices, d_idx, sizeOutIdx, hipMemcpyDeviceToHost, stream)) != hipSuccess) break;

        // Wait for the stream so that both outputs are complete before returning.
        step = "hipStreamSynchronize";
        err = hipStreamSynchronize(stream);
    } while (0);

    if (err != hipSuccess) {
        if (!allocated) {
            fprintf(stderr, "HIP malloc failed in rocm_topk\n");
        }
        fprintf(stderr, "rocm_topk: %s failed: %s\n", step, hipGetErrorString(err));
    }

    // Release device memory; pointers that were never allocated are still null.
    hipFree(d_X);
    hipFree(d_vals);
    hipFree(d_idx);
}