Commit a1c29028 authored by zhangqha
update uni-fold
// !!! This is a file automatically generated by hipify!!!
#include "ATen/ATen.h"
#include "ATen/hip/HIPContext.h"
#include "ATen/hip/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/hip/Exceptions.h>
#include "type_shim_hip.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
GRAD_T* __restrict__ p,
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T * __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
const float decay_size)
{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
for (int j = i; j < tsize; j+=totThreads) {
// weight decay
T cur_p = (T)p[j] * decay_size;
T scaled_grad = static_cast<T>(g[j]) / grad_scale;
m[j] = b1*m[j] + (1-b1)*scaled_grad;
v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
const float update = m[j] / (sqrtf(v[j]) + eps);
p[j] = cur_p - (step_size*update);
}
}
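// Hedged reference sketch (not part of the extension): a host-side restatement of the
// per-element update the kernel above performs, useful for unit-testing the fused path.
// It assumes the same conventions as the kernel: the gradient is divided by grad_scale
// (loss scaling), decay_size = 1 - step_size * decay folds decoupled weight decay into a
// single multiply, and step_size already contains any bias correction.
template <typename T>
static inline void adam_reference_step(
    T* p, T* m, T* v, const T* g, size_t n,
    float b1, float b2, float eps,
    float grad_scale, float step_size, float decay_size)
{
    for (size_t j = 0; j < n; j++) {
        T cur_p = p[j] * (T)decay_size;                            // decoupled weight decay
        T scaled_grad = g[j] / (T)grad_scale;                      // undo loss scaling
        m[j] = b1 * m[j] + (1 - b1) * scaled_grad;                 // first-moment EMA
        v[j] = b2 * v[j] + (1 - b2) * scaled_grad * scaled_grad;   // second-moment EMA
        T update = m[j] / (std::sqrt(v[j]) + (T)eps);
        p[j] = cur_p - (T)step_size * update;
    }
}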
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & m,
at::Tensor & v,
at::Tensor & g,
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int bias_correction,
float decay)
{
//Get tensor size
int tsize = p.numel();
//Determine #threads and #blocks
const int threadsPerBlock = 512;
const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
//Constants
float step_size = lr;
if (bias_correction == 1) {
const double bias_correction1 = 1.0 - ::pow(static_cast<double>(beta1), step);
const double bias_correction2 = 1.0 - ::pow(static_cast<double>(beta2), step);
step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
}
float decay_size = 1.0;
if (decay != 0.0) {
decay_size = 1.0 - step_size * decay;
}
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
hipLaunchKernelGGL(( adam_cuda_kernel<accscalar_t, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<accscalar_t>(),
v.data_ptr<accscalar_t>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
hipLaunchKernelGGL(( adam_cuda_kernel<scalar_t_0, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<scalar_t_0>(),
v.data_ptr<scalar_t_0>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
}
AT_CUDA_CHECK(hipGetLastError());
}
#include <torch/extension.h>
void fused_adam_cuda(at::Tensor & p, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int bias_correction, float decay);
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
void adam(at::Tensor & p, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int bias_correction, float decay) {
CHECK_INPUT(p);
CHECK_INPUT(m);
CHECK_INPUT(v);
CHECK_INPUT(g);
int64_t num_elem = p.numel();
AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
fused_adam_cuda(p, m, v, g, lr, beta1, beta2, eps, grad_scale, step, bias_correction, decay);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("adam", &adam, "Adam optimized CUDA implementation.");
}
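// Hedged usage note (assumption, not stated in the source): once built, the exported "adam"
// entry point is presumably called from Python with the parameter, exp_avg, exp_avg_sq and
// grad tensors plus the scalar hyper-parameters, roughly
//   module.adam(p, m, v, g, lr, beta1, beta2, eps, grad_scale, step, bias_correction, decay)
// where grad_scale is the loss-scaling factor divided out of g, bias_correction == 1 enables
// the bias-corrected step size, and decay applies decoupled weight decay.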
#include "hip/hip_runtime.h"
#include "ATen/ATen.h"
#include "ATen/cuda/HIPContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/cuda/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
GRAD_T* __restrict__ p,
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T * __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
const float decay_size)
{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
for (int j = i; j < tsize; j+=totThreads) {
// weight decay
T cur_p = (T)p[j] * decay_size;
T scaled_grad = static_cast<T>(g[j]) / grad_scale;
m[j] = b1*m[j] + (1-b1)*scaled_grad;
v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
const float update = m[j] / (sqrtf(v[j]) + eps);
p[j] = cur_p - (step_size*update);
}
}
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & m,
at::Tensor & v,
at::Tensor & g,
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int bias_correction,
float decay)
{
//Get tensor size
int tsize = p.numel();
//Determine #threads and #blocks
const int threadsPerBlock = 512;
const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
//Constants
float step_size = lr;
if (bias_correction == 1) {
const double bias_correction1 = 1.0 - std::pow(static_cast<double>(beta1), step);
const double bias_correction2 = 1.0 - std::pow(static_cast<double>(beta2), step);
step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
}
float decay_size = 1.0;
if (decay != 0.0) {
decay_size = 1.0 - step_size * decay;
}
hipStream_t stream = at::cuda::getCurrentCUDAStream();
if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
p.data_ptr<scalar_t_0>(),
m.data_ptr<accscalar_t>(),
v.data_ptr<accscalar_t>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
p.data_ptr<scalar_t_0>(),
m.data_ptr<scalar_t_0>(),
v.data_ptr<scalar_t_0>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
}
AT_CUDA_CHECK(hipGetLastError());
}
#include "ATen/ATen.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/cuda/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
GRAD_T* __restrict__ p,
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T * __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
const float decay_size)
{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
for (int j = i; j < tsize; j+=totThreads) {
// weight decay
T cur_p = (T)p[j] * decay_size;
T scaled_grad = static_cast<T>(g[j]) / grad_scale;
m[j] = b1*m[j] + (1-b1)*scaled_grad;
v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
const float update = m[j] / (sqrtf(v[j]) + eps);
p[j] = cur_p - (step_size*update);
}
}
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & m,
at::Tensor & v,
at::Tensor & g,
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int bias_correction,
float decay)
{
//Get tensor size
int tsize = p.numel();
//Determine #threads and #blocks
const int threadsPerBlock = 512;
const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
//Constants
float step_size = lr;
if (bias_correction == 1) {
const double bias_correction1 = 1.0 - std::pow(static_cast<double>(beta1), step);
const double bias_correction2 = 1.0 - std::pow(static_cast<double>(beta2), step);
step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
}
float decay_size = 1.0;
if (decay != 0.0) {
decay_size = 1.0 - step_size * decay;
}
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
p.data_ptr<scalar_t_0>(),
m.data_ptr<accscalar_t>(),
v.data_ptr<accscalar_t>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
p.data_ptr<scalar_t_0>(),
m.data_ptr<scalar_t_0>(),
v.data_ptr<scalar_t_0>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
}
AT_CUDA_CHECK(cudaGetLastError());
}
// !!! This is a file automatically generated by hipify!!!
#include "ATen/ATen.h"
#include "ATen/hip/HIPContext.h"
#include "ATen/hip/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/hip/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
GRAD_T* __restrict__ p,
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T * __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
const float decay_size)
{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
for (int j = i; j < tsize; j+=totThreads) {
// weight decay
T cur_p = (T)p[j] * decay_size;
T scaled_grad = static_cast<T>(g[j]) / grad_scale;
m[j] = b1*m[j] + (1-b1)*scaled_grad;
v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
const float update = m[j] / (sqrtf(v[j]) + eps);
p[j] = cur_p - (step_size*update);
}
}
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & m,
at::Tensor & v,
at::Tensor & g,
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int bias_correction,
float decay)
{
//Get tensor size
int tsize = p.numel();
//Determine #threads and #blocks
const int threadsPerBlock = 512;
const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
//Constants
float step_size = lr;
if (bias_correction == 1) {
const double bias_correction1 = 1.0 - ::pow(static_cast<double>(beta1), step);
const double bias_correction2 = 1.0 - ::pow(static_cast<double>(beta2), step);
step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
}
float decay_size = 1.0;
if (decay != 0.0) {
decay_size = 1.0 - step_size * decay;
}
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
hipLaunchKernelGGL(( adam_cuda_kernel<accscalar_t, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<accscalar_t>(),
v.data_ptr<accscalar_t>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
hipLaunchKernelGGL(( adam_cuda_kernel<scalar_t_0, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<scalar_t_0>(),
v.data_ptr<scalar_t_0>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
}
AT_CUDA_CHECK(hipGetLastError());
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "ATen/ATen.h"
#include "ATen/hip/HIPContext.h"
#include "ATen/hip/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/hip/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
GRAD_T* __restrict__ p,
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T * __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
const float decay_size)
{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
for (int j = i; j < tsize; j+=totThreads) {
// weight decay
T cur_p = (T)p[j] * decay_size;
T scaled_grad = static_cast<T>(g[j]) / grad_scale;
m[j] = b1*m[j] + (1-b1)*scaled_grad;
v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
const float update = m[j] / (sqrtf(v[j]) + eps);
p[j] = cur_p - (step_size*update);
}
}
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & m,
at::Tensor & v,
at::Tensor & g,
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int bias_correction,
float decay)
{
//Get tensor size
int tsize = p.numel();
//Determine #threads and #blocks
const int threadsPerBlock = 512;
const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
//Constants
float step_size = lr;
if (bias_correction == 1) {
const double bias_correction1 = 1.0 - std::pow(static_cast<double>(beta1), step);
const double bias_correction2 = 1.0 - std::pow(static_cast<double>(beta2), step);
step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
}
float decay_size = 1.0;
if (decay != 0.0) {
decay_size = 1.0 - step_size * decay;
}
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
hipLaunchKernelGGL(( adam_cuda_kernel<accscalar_t, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<accscalar_t>(),
v.data_ptr<accscalar_t>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
hipLaunchKernelGGL(( adam_cuda_kernel<scalar_t_0, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<scalar_t_0>(),
v.data_ptr<scalar_t_0>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
}
AT_CUDA_CHECK(hipGetLastError());
}
#include <torch/extension.h>
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
std::vector<std::vector<at::Tensor>> tensor_lists);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("l2norm", &multi_tensor_l2norm_cuda,
"Computes L2 norm for a list of contiguous tensors");
}
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template<int n> struct TensorListMetadata
{
void* addresses[n][depth_to_max_tensors[n-1]];
int sizes[depth_to_max_tensors[n-1]];
unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
int block_to_chunk[depth_to_max_blocks[n-1]];
int start_tensor_this_launch;
};
template<typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
int chunk_size,
T tl,
U callable,
ArgTypes... args)
{
callable(chunk_size, tl, args...);
}
template<int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
int block_size,
int chunk_size,
const std::vector<std::vector<at::Tensor>>& tensor_lists,
T callable,
ArgTypes... args)
{
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
auto ref_dtype = tensor_lists[0][0].scalar_type();
for (int l = 0; l < tensor_lists.size(); l++)
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for(int t = 0; t < tensor_lists[l].size(); t++)
{
bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
}
}
int ntensors = tensor_lists[0].size();
TensorListMetadata<depth> tl;
const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
auto stream = at::cuda::getCurrentCUDAStream();
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
for(int t = 0; t < ntensors; t++)
{
tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
for(int d = 0; d < depth; d++)
tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
loc_tensor_info++;
int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
for(int chunk = 0; chunk < chunks_this_tensor; chunk++)
{
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;
loc_block_info++;
bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
if(tensors_full || blocks_full || last_chunk)
{
multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
chunk_size,
tl,
callable,
args...);
AT_CUDA_CHECK(cudaGetLastError());
loc_block_info = 0;
if(chunk == chunks_this_tensor - 1)
{
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
}
else
{
tl.sizes[0] = tl.sizes[loc_tensor_info-1];
for(int d = 0; d < depth; d++)
tl.addresses[d][0] = tl.addresses[d][loc_tensor_info-1];
loc_tensor_info = 1;
tl.start_tensor_this_launch = t;
}
}
}
}
}
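// Hedged usage sketch (illustration only, following this header's conventions): a minimal
// depth-1 functor showing how multi_tensor_apply dispatches work. Each block looks up its
// (tensor, chunk) assignment in TensorListMetadata and processes one chunk; the pattern
// mirrors the L2NormFunctor defined elsewhere in this commit.
template <typename x_t>
struct ScaleFunctor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    TensorListMetadata<1>& tl,
    float scale)
  {
    int tensor_loc = tl.block_to_tensor[blockIdx.x];   // which tensor this block works on
    int chunk_idx  = tl.block_to_chunk[blockIdx.x];    // which chunk inside that tensor
    int n = tl.sizes[tensor_loc];
    x_t* x = (x_t*)tl.addresses[0][tensor_loc];
    x += chunk_idx * chunk_size;                        // jump to this block's chunk
    n -= chunk_idx * chunk_size;                        // elements remaining from the chunk start
    for (int i = threadIdx.x; i < n && i < chunk_size; i += blockDim.x)
      x[i] = static_cast<x_t>(static_cast<float>(x[i]) * scale);
  }
};
// Hypothetical host-side call, scaling a list of float tensors in place:
//   multi_tensor_apply<1>(512, 2048 * 32, {tensors}, ScaleFunctor<float>(), 0.5f);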
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/HIPContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template<int n> struct TensorListMetadata
{
void* addresses[n][depth_to_max_tensors[n-1]];
int sizes[depth_to_max_tensors[n-1]];
unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
int block_to_chunk[depth_to_max_blocks[n-1]];
int start_tensor_this_launch;
};
template<typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
int chunk_size,
T tl,
U callable,
ArgTypes... args)
{
callable(chunk_size, tl, args...);
}
template<int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
int block_size,
int chunk_size,
const std::vector<std::vector<at::Tensor>>& tensor_lists,
T callable,
ArgTypes... args)
{
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
auto ref_dtype = tensor_lists[0][0].scalar_type();
for (int l = 0; l < tensor_lists.size(); l++)
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for(int t = 0; t < tensor_lists[l].size(); t++)
{
bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
}
}
int ntensors = tensor_lists[0].size();
TensorListMetadata<depth> tl;
const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
auto stream = at::cuda::getCurrentCUDAStream();
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
for(int t = 0; t < ntensors; t++)
{
tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
for(int d = 0; d < depth; d++)
tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
loc_tensor_info++;
int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
for(int chunk = 0; chunk < chunks_this_tensor; chunk++)
{
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;
loc_block_info++;
bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
if(tensors_full || blocks_full || last_chunk)
{
multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
chunk_size,
tl,
callable,
args...);
AT_CUDA_CHECK(hipGetLastError());
loc_block_info = 0;
if(chunk == chunks_this_tensor - 1)
{
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
}
else
{
tl.sizes[0] = tl.sizes[loc_tensor_info-1];
for(int d = 0; d < depth; d++)
tl.addresses[d][0] = tl.addresses[d][loc_tensor_info-1];
loc_tensor_info = 1;
tl.start_tensor_this_launch = t;
}
}
}
}
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template<int n> struct TensorListMetadata
{
void* addresses[n][depth_to_max_tensors[n-1]];
int sizes[depth_to_max_tensors[n-1]];
unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
int block_to_chunk[depth_to_max_blocks[n-1]];
int start_tensor_this_launch;
};
template<typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
int chunk_size,
T tl,
U callable,
ArgTypes... args)
{
callable(chunk_size, tl, args...);
}
template<int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
int block_size,
int chunk_size,
const std::vector<std::vector<at::Tensor>>& tensor_lists,
T callable,
ArgTypes... args)
{
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
auto ref_dtype = tensor_lists[0][0].scalar_type();
for (int l = 0; l < tensor_lists.size(); l++)
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for(int t = 0; t < tensor_lists[l].size(); t++)
{
bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
}
}
int ntensors = tensor_lists[0].size();
TensorListMetadata<depth> tl;
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
for(int t = 0; t < ntensors; t++)
{
tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
for(int d = 0; d < depth; d++)
tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
loc_tensor_info++;
int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
for(int chunk = 0; chunk < chunks_this_tensor; chunk++)
{
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;
loc_block_info++;
bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
if(tensors_full || blocks_full || last_chunk)
{
hipLaunchKernelGGL(( multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream,
chunk_size,
tl,
callable,
args...);
AT_CUDA_CHECK(hipGetLastError());
loc_block_info = 0;
if(chunk == chunks_this_tensor - 1)
{
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
}
else
{
tl.sizes[0] = tl.sizes[loc_tensor_info-1];
for(int d = 0; d < depth; d++)
tl.addresses[d][0] = tl.addresses[d][loc_tensor_info-1];
loc_tensor_info = 1;
tl.start_tensor_this_launch = t;
}
}
}
}
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template<int n> struct TensorListMetadata
{
void* addresses[n][depth_to_max_tensors[n-1]];
int sizes[depth_to_max_tensors[n-1]];
unsigned char block_to_tensor[depth_to_max_blocks[n-1]];
int block_to_chunk[depth_to_max_blocks[n-1]];
int start_tensor_this_launch;
};
template<typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
int chunk_size,
T tl,
U callable,
ArgTypes... args)
{
callable(chunk_size, tl, args...);
}
template<int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
int block_size,
int chunk_size,
const std::vector<std::vector<at::Tensor>>& tensor_lists,
T callable,
ArgTypes... args)
{
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
auto ref_dtype = tensor_lists[0][0].scalar_type();
for (int l = 0; l < tensor_lists.size(); l++)
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for(int t = 0; t < tensor_lists[l].size(); t++)
{
bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
}
}
int ntensors = tensor_lists[0].size();
TensorListMetadata<depth> tl;
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
for(int t = 0; t < ntensors; t++)
{
tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
for(int d = 0; d < depth; d++)
tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
loc_tensor_info++;
int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
for(int chunk = 0; chunk < chunks_this_tensor; chunk++)
{
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;
loc_block_info++;
bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
if(tensors_full || blocks_full || last_chunk)
{
hipLaunchKernelGGL(( multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream,
chunk_size,
tl,
callable,
args...);
AT_CUDA_CHECK(hipGetLastError());
loc_block_info = 0;
if(chunk == chunks_this_tensor - 1)
{
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
}
else
{
tl.sizes[0] = tl.sizes[loc_tensor_info-1];
for(int d = 0; d < depth; d++)
tl.addresses[d][0] = tl.addresses[d][loc_tensor_info-1];
loc_tensor_info = 1;
tl.start_tensor_this_launch = t;
}
}
}
}
}
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/HIPContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
TensorListMetadata<1>& tl,
float* output)
{
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP];
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.0f;
r_x[i] = (x_t)0.0f;
}
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] += next*next;
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] += next*next;
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val += vals[i];
float res = reduce_block_into_lanes(s_vals, val);
if(threadIdx.x == 0)
{
output[blockIdx.x] += res;
}
}
};
__global__ void cleanup(
float* output,
float* ret)
{
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
std::vector<std::vector<at::Tensor>> tensor_lists)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
auto output = at::zeros({320}, float_options);
switch (tensor_lists[0][0].scalar_type()){
case at::ScalarType::Float: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<float>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::Half: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<half>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::BFloat16: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<nv_bfloat16>(),
output.data_ptr<float>()
);
break;
}
}
AT_CUDA_CHECK(hipGetLastError());
auto ret = at::empty({1}, output.options());
const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<1, 512, 0, stream>>>(
output.data_ptr<float>(),
ret.data_ptr<float>());
return ret;
}
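// Hedged reference sketch (not part of the extension): what the two-stage reduction above
// computes, restated on the host. Stage one accumulates per-block sums of squares into the
// 320-entry scratch buffer; the cleanup kernel then reduces that buffer and takes the square
// root, i.e. the global L2 norm over every element of every tensor in the list. Plain pointer
// arguments are used to avoid assuming extra headers; sqrt comes in transitively via ATen.
static inline float l2norm_reference(const float* const* tensors, const long long* numels, int ntensors)
{
    double sum_sq = 0.0;   // accumulate in double for a stable reference value
    for (int t = 0; t < ntensors; t++)
        for (long long i = 0; i < numels[t]; i++)
            sum_sq += (double)tensors[t][i] * tensors[t][i];
    return (float)sqrt(sum_sq);
}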
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
//#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
//#define BLOCK_SIZE 512
#define BLOCK_SIZE 256
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
TensorListMetadata<1>& tl,
float* output)
{
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP];
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.0f;
r_x[i] = (x_t)0.0f;
}
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] += next*next;
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] += next*next;
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val += vals[i];
float res = reduce_block_into_lanes(s_vals, val);
if(threadIdx.x == 0)
{
output[blockIdx.x] += res;
}
}
};
__global__ void cleanup(
float* output,
float* ret)
{
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
std::vector<std::vector<at::Tensor>> tensor_lists)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
auto output = at::zeros({320}, float_options);
switch (tensor_lists[0][0].scalar_type()){
case at::ScalarType::Float: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<float>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::Half: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<half>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::BFloat16: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<nv_bfloat16>(),
output.data_ptr<float>()
);
break;
}
}
AT_CUDA_CHECK(cudaGetLastError());
auto ret = at::empty({1}, output.options());
const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<1, 512, 0, stream>>>(
output.data_ptr<float>(),
ret.data_ptr<float>());
return ret;
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
//#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply_hip.cuh"
//#define BLOCK_SIZE 512
#define BLOCK_SIZE 256
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
TensorListMetadata<1>& tl,
float* output)
{
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP];
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.0f;
r_x[i] = (x_t)0.0f;
}
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] += next*next;
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] += next*next;
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val += vals[i];
float res = reduce_block_into_lanes(s_vals, val);
if(threadIdx.x == 0)
{
output[blockIdx.x] += res;
}
}
};
__global__ void cleanup(
float* output,
float* ret)
{
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
std::vector<std::vector<at::Tensor>> tensor_lists)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
auto output = at::zeros({320}, float_options);
switch (tensor_lists[0][0].scalar_type()){
case at::ScalarType::Float: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<float>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::Half: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<half>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::BFloat16: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<nv_bfloat16>(),
output.data_ptr<float>()
);
break;
}
}
AT_CUDA_CHECK(hipGetLastError());
auto ret = at::empty({1}, output.options());
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(output));
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( cleanup), dim3(1), dim3(512), 0, stream,
output.data_ptr<float>(),
ret.data_ptr<float>());
return ret;
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply_hip.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
TensorListMetadata<1>& tl,
float* output)
{
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP];
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.0f;
r_x[i] = (x_t)0.0f;
}
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] += next*next;
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] += next*next;
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val += vals[i];
float res = reduce_block_into_lanes(s_vals, val);
if(threadIdx.x == 0)
{
output[blockIdx.x] += res;
}
}
};
__global__ void cleanup(
float* output,
float* ret)
{
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
std::vector<std::vector<at::Tensor>> tensor_lists)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
auto output = at::zeros({320}, float_options);
switch (tensor_lists[0][0].scalar_type()){
case at::ScalarType::Float: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<float>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::Half: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<half>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::BFloat16: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<nv_bfloat16>(),
output.data_ptr<float>()
);
break;
}
}
AT_CUDA_CHECK(hipGetLastError());
auto ret = at::empty({1}, output.options());
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(output));
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( cleanup), dim3(1), dim3(512), 0, stream,
output.data_ptr<float>(),
ret.data_ptr<float>());
return ret;
}
#include "hip/hip_runtime.h"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <c10/cuda/CUDAMathCompat.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cuda_bf16.h>
#include <hiprand_kernel.h>
#include <ATen/cuda/HIPContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
uint32_t i;
float f;
};
__global__ void fp32_to_bf16(
const float* input,
nv_bfloat16* output,
const int tsize,
uint64_t seed,
uint64_t offset) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < tsize) {
float_int_32 d;
d.f = input[i];
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, i, offset, &state);
d.i += hiprand(&state) & 0x0000ffff;
output[i] = __float2bfloat16_rz(d.f);
}
}
void fused_fp32_to_bf16_sr_cuda(
at::Tensor & input,
at::Tensor & output)
{
int tsize = input.numel();
const int threadsPerBlock = 512;
const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
auto gen = at::cuda::detail::getDefaultCUDAGenerator();
std::pair<uint64_t, uint64_t> rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen.mutex());
rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
}
uint64_t seed = std::get<0>(rng_engine_inputs);
uint64_t offset = std::get<1>(rng_engine_inputs);
hipStream_t stream = at::cuda::getCurrentCUDAStream();
fp32_to_bf16<<<blocks, threadsPerBlock, 0, stream>>>(
(const float*)input.data_ptr(),
(nv_bfloat16*)output.data_ptr(),
tsize,
seed,
offset);
AT_CUDA_CHECK(hipGetLastError());
}
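// Hedged host-side sketch (illustration only): the stochastic-rounding trick the kernel above
// uses. A bf16 value is the upper 16 bits of an fp32 bit pattern, so adding a uniform random
// 16-bit integer to the fp32 bits before truncating rounds up with probability proportional to
// the discarded fraction. The rnd argument is a placeholder for any uniform 16-bit source.
static inline uint16_t fp32_to_bf16_sr_reference(float x, uint16_t rnd)
{
    float_int_32 d;               // union defined above: fp32 bits viewed as uint32_t
    d.f = x;
    d.i += rnd;                   // perturb by a random value in [0, 0xffff]
    return (uint16_t)(d.i >> 16); // truncate (round toward zero) to the bf16 bit pattern
}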
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <c10/cuda/CUDAMathCompat.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <curand_kernel.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
uint32_t i;
float f;
};
__global__ void fp32_to_bf16(
const float* input,
nv_bfloat16* output,
const int tsize,
uint64_t seed,
uint64_t offset) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < tsize) {
float_int_32 d;
d.f = input[i];
curandStatePhilox4_32_10_t state;
curand_init(seed, i, offset, &state);
d.i += curand(&state) & 0x0000ffff;
output[i] = __float2bfloat16_rz(d.f);
}
}
void fused_fp32_to_bf16_sr_cuda(
at::Tensor & input,
at::Tensor & output)
{
int tsize = input.numel();
const int threadsPerBlock = 512;
const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
auto gen = at::cuda::detail::getDefaultCUDAGenerator();
std::pair<uint64_t, uint64_t> rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen.mutex());
rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
}
uint64_t seed = std::get<0>(rng_engine_inputs);
uint64_t offset = std::get<1>(rng_engine_inputs);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
fp32_to_bf16<<<blocks, threadsPerBlock, 0, stream>>>(
(const float*)input.data_ptr(),
(nv_bfloat16*)output.data_ptr(),
tsize,
seed,
offset);
AT_CUDA_CHECK(cudaGetLastError());
}
// !!! This is a file automatically generated by hipify!!!
#include <vector>
#include <ATen/ATen.h>
#include <ATen/hip/HIPGeneratorImpl.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include <c10/hip/HIPMathCompat.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cuda_bf16.h>
#include <hiprand/hiprand_kernel.h>
#include <ATen/hip/HIPContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
uint32_t i;
float f;
};
__global__ void fp32_to_bf16(
const float* input,
nv_bfloat16* output,
const int tsize,
uint64_t seed,
uint64_t offset) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < tsize) {
float_int_32 d;
d.f = input[i];
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, i, offset, &state);
d.i += hiprand(&state) & 0x0000ffff;
output[i] = __float2bfloat16_rz(d.f);
}
}
void fused_fp32_to_bf16_sr_cuda(
at::Tensor & input,
at::Tensor & output)
{
int tsize = input.numel();
const int threadsPerBlock = 512;
const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
auto gen = at::cuda::detail::getDefaultCUDAGenerator();
std::pair<uint64_t, uint64_t> rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen.mutex());
rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
}
uint64_t seed = std::get<0>(rng_engine_inputs);
uint64_t offset = std::get<1>(rng_engine_inputs);
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( fp32_to_bf16), dim3(blocks), dim3(threadsPerBlock), 0, stream,
(const float*)input.data_ptr(),
(nv_bfloat16*)output.data_ptr(),
tsize,
seed,
offset);
AT_CUDA_CHECK(hipGetLastError());
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/hip/HIPGeneratorImpl.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include <c10/hip/HIPMathCompat.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cuda_bf16.h>
#include <hiprand_kernel.h>
#include <ATen/hip/HIPContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
uint32_t i;
float f;
};
__global__ void fp32_to_bf16(
const float* input,
nv_bfloat16* output,
const int tsize,
uint64_t seed,
uint64_t offset) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < tsize) {
float_int_32 d;
d.f = input[i];
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, i, offset, &state);
d.i += hiprand(&state) & 0x0000ffff;
output[i] = __float2bfloat16_rz(d.f);
}
}
void fused_fp32_to_bf16_sr_cuda(
at::Tensor & input,
at::Tensor & output)
{
int tsize = input.numel();
const int threadsPerBlock = 512;
const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
auto gen = at::cuda::detail::getDefaultCUDAGenerator();
std::pair<uint64_t, uint64_t> rng_engine_inputs;
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen.mutex());
rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
}
uint64_t seed = std::get<0>(rng_engine_inputs);
uint64_t offset = std::get<1>(rng_engine_inputs);
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( fp32_to_bf16), dim3(blocks), dim3(threadsPerBlock), 0, stream,
(const float*)input.data_ptr(),
(nv_bfloat16*)output.data_ptr(),
tsize,
seed,
offset);
AT_CUDA_CHECK(hipGetLastError());
}
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
void fused_fp32_to_bf16_sr_cuda(at::Tensor & input, at::Tensor & output);
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
void fused_fp32_to_bf16_sr(at::Tensor & input, at::Tensor & output) {
CHECK_INPUT(input);
CHECK_INPUT(output);
int64_t num_elem = input.numel();
AT_ASSERTM(output.numel() == num_elem, "number of elements in input and output tensors should be equal");
fused_fp32_to_bf16_sr_cuda(input, output);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("fp32_to_bf16_sr", &fused_fp32_to_bf16_sr, "fused fp32 to bf16 random rounding");
}