ModelZoo / Uni-Fold_pytorch

Commit a1c29028 authored Apr 17, 2023 by zhangqha

update uni-fold

Pipeline #183 canceled with stages
Changes 312    Pipelines 1

Showing 20 changed files with 2098 additions and 0 deletions (+2098, -0)
Uni-Core-main/csrc/adam/adam_kernel.hip                                  +116  -0
Uni-Core-main/csrc/adam/interface.cpp                                     +24  -0
Uni-Core-main/csrc/bak/adam_kernel.cpp                                   +116  -0
Uni-Core-main/csrc/bak/adam_kernel.cu                                    +115  -0
Uni-Core-main/csrc/bak/adam_kernel.hip                                   +116  -0
Uni-Core-main/csrc/bak/adam_kernel_hip.cpp                               +117  -0
Uni-Core-main/csrc/bak/multi_tensor/interface.cpp                         +13  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply.cuh               +120  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply.h                 +121  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply_hip.cuh           +122  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply_hip.h             +122  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel.cpp       +172  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel.cu        +172  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel.hip       +174  -0
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel_hip.cpp   +173  -0
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16.cpp                          +70  -0
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16.cu                           +69  -0
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16.hip                          +70  -0
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16_hip.cpp                      +71  -0
Uni-Core-main/csrc/bak/rounding/interface.cpp                             +25  -0
Uni-Core-main/csrc/adam/adam_kernel.hip
0 → 100644
// !!! This is a file automatically generated by hipify!!!
#include "ATen/ATen.h"
#include "ATen/hip/HIPContext.h"
#include "ATen/hip/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/hip/Exceptions.h>
#include "type_shim_hip.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
GRAD_T* __restrict__ p,
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T * __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
const float decay_size)
{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
for (int j = i; j < tsize; j+=totThreads) {
// weight decay
T cur_p = (T)p[j] * decay_size;
T scaled_grad = static_cast<T>(g[j]) / grad_scale;
m[j] = b1*m[j] + (1-b1)*scaled_grad;
v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
const float update = m[j] / (sqrtf(v[j]) + eps);
p[j] = cur_p - (step_size*update);
}
}
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & m,
at::Tensor & v,
at::Tensor & g,
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int bias_correction,
float decay)
{
//Get tensor size
int tsize = p.numel();
//Determine #threads and #blocks
const int threadsPerBlock = 512;
const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
//Constants
float step_size = lr;
if (bias_correction == 1) {
const double bias_correction1 = 1.0 - ::pow(static_cast<double>(beta1), step);
const double bias_correction2 = 1.0 - ::pow(static_cast<double>(beta2), step);
step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
}
float decay_size = 1.0;
if (decay != 0.0) {
decay_size = 1.0 - step_size * decay;
}
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
hipLaunchKernelGGL(( adam_cuda_kernel<accscalar_t, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<accscalar_t>(),
v.data_ptr<accscalar_t>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
hipLaunchKernelGGL(( adam_cuda_kernel<scalar_t_0, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<scalar_t_0>(),
v.data_ptr<scalar_t_0>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
}
AT_CUDA_CHECK(hipGetLastError());
}
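For reference, the per-element arithmetic that adam_cuda_kernel performs can be restated as a plain host-side loop. The sketch below is not part of the commit; it is a minimal CPU re-statement of the same update (bias correction folded into step_size, weight decay folded into decay_size, the gradient unscaled by grad_scale), useful for checking the kernel's output on small tensors. The helper name adam_reference is hypothetical.

// Minimal CPU reference for the fused Adam step above (illustration only, not in the commit).
#include <cmath>
#include <cstddef>
#include <vector>

void adam_reference(std::vector<float>& p, std::vector<float>& m, std::vector<float>& v,
                    const std::vector<float>& g,
                    float lr, float beta1, float beta2, float eps,
                    float grad_scale, int step, int bias_correction, float decay) {
    // step_size and decay_size are precomputed exactly as in fused_adam_cuda().
    float step_size = lr;
    if (bias_correction == 1) {
        const double bc1 = 1.0 - std::pow((double)beta1, step);
        const double bc2 = 1.0 - std::pow((double)beta2, step);
        step_size = (float)(lr * std::sqrt(bc2) / bc1);
    }
    float decay_size = 1.0f;
    if (decay != 0.0f) decay_size = 1.0f - step_size * decay;
    for (std::size_t j = 0; j < p.size(); ++j) {
        float cur_p = p[j] * decay_size;        // weight decay applied to the parameter
        float scaled_grad = g[j] / grad_scale;  // undo loss scaling
        m[j] = beta1 * m[j] + (1.0f - beta1) * scaled_grad;
        v[j] = beta2 * v[j] + (1.0f - beta2) * scaled_grad * scaled_grad;
        float update = m[j] / (std::sqrt(v[j]) + eps);
        p[j] = cur_p - step_size * update;
    }
}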
Uni-Core-main/csrc/adam/interface.cpp
0 → 100644
#include <torch/extension.h>
void fused_adam_cuda(
    at::Tensor & p,
    at::Tensor & m,
    at::Tensor & v,
    at::Tensor & g,
    float lr,
    float beta1,
    float beta2,
    float eps,
    float grad_scale,
    int step,
    int bias_correction,
    float decay);
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
void adam(
    at::Tensor & p,
    at::Tensor & m,
    at::Tensor & v,
    at::Tensor & g,
    float lr,
    float beta1,
    float beta2,
    float eps,
    float grad_scale,
    int step,
    int bias_correction,
    float decay)
{
    CHECK_INPUT(p);
    CHECK_INPUT(m);
    CHECK_INPUT(v);
    CHECK_INPUT(g);
    int64_t num_elem = p.numel();
    AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
    AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
    AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
    fused_adam_cuda(p, m, v, g, lr, beta1, beta2, eps, grad_scale, step, bias_correction, decay);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("adam", &adam, "Adam optimized CUDA implementation.");
}
\ No newline at end of file
Uni-Core-main/csrc/bak/adam_kernel.cpp
0 → 100644
#include "hip/hip_runtime.h"
#include "ATen/ATen.h"
#include "ATen/cuda/HIPContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/cuda/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
    GRAD_T* __restrict__ p,
    T* __restrict__ m,
    T* __restrict__ v,
    const GRAD_T * __restrict__ g,
    const float b1,
    const float b2,
    const float eps,
    const float grad_scale,
    const float step_size,
    const size_t tsize,
    const float decay_size)
{
    //Assuming 2D grids and 2D blocks
    const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
    const int threadsPerBlock = blockDim.x * blockDim.y;
    const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
    const int i = (blockId * threadsPerBlock + threadIdInBlock);
    const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
    for (int j = i; j < tsize; j+=totThreads) {
        // weight decay
        T cur_p = (T)p[j] * decay_size;
        T scaled_grad = static_cast<T>(g[j]) / grad_scale;
        m[j] = b1*m[j] + (1-b1)*scaled_grad;
        v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
        const float update = m[j] / (sqrtf(v[j]) + eps);
        p[j] = cur_p - (step_size*update);
    }
}
void fused_adam_cuda(
    at::Tensor & p,
    at::Tensor & m,
    at::Tensor & v,
    at::Tensor & g,
    float lr,
    float beta1,
    float beta2,
    float eps,
    float grad_scale,
    int step,
    int bias_correction,
    float decay)
{
    //Get tensor size
    int tsize = p.numel();
    //Determine #threads and #blocks
    const int threadsPerBlock = 512;
    const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
    AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
    //Constants
    float step_size = lr;
    if (bias_correction == 1) {
        const double bias_correction1 = 1.0 - std::pow(static_cast<double>(beta1), step);
        const double bias_correction2 = 1.0 - std::pow(static_cast<double>(beta2), step);
        step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
    }
    float decay_size = 1.0;
    if (decay != 0.0) {
        decay_size = 1.0 - step_size * decay;
    }
    hipStream_t stream = at::cuda::getCurrentCUDAStream();
    if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
        AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
        using namespace at; // prevents "toString is undefined" errors
        DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
            using accscalar_t = at::acc_type<scalar_t_0, true>;
            adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
                p.data_ptr<scalar_t_0>(),
                m.data_ptr<accscalar_t>(),
                v.data_ptr<accscalar_t>(),
                g.data_ptr<scalar_t_0>(),
                beta1,
                beta2,
                eps,
                grad_scale,
                step_size,
                tsize,
                decay_size);
        );
    } else {
        using namespace at;
        DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
            adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
                p.data_ptr<scalar_t_0>(),
                m.data_ptr<scalar_t_0>(),
                v.data_ptr<scalar_t_0>(),
                g.data_ptr<scalar_t_0>(),
                beta1,
                beta2,
                eps,
                grad_scale,
                step_size,
                tsize,
                decay_size);
        );
    }
    AT_CUDA_CHECK(hipGetLastError());
}
Uni-Core-main/csrc/bak/adam_kernel.cu
0 → 100644
#include "ATen/ATen.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/cuda/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
    GRAD_T* __restrict__ p,
    T* __restrict__ m,
    T* __restrict__ v,
    const GRAD_T * __restrict__ g,
    const float b1,
    const float b2,
    const float eps,
    const float grad_scale,
    const float step_size,
    const size_t tsize,
    const float decay_size)
{
    //Assuming 2D grids and 2D blocks
    const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
    const int threadsPerBlock = blockDim.x * blockDim.y;
    const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
    const int i = (blockId * threadsPerBlock + threadIdInBlock);
    const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
    for (int j = i; j < tsize; j+=totThreads) {
        // weight decay
        T cur_p = (T)p[j] * decay_size;
        T scaled_grad = static_cast<T>(g[j]) / grad_scale;
        m[j] = b1*m[j] + (1-b1)*scaled_grad;
        v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
        const float update = m[j] / (sqrtf(v[j]) + eps);
        p[j] = cur_p - (step_size*update);
    }
}
void fused_adam_cuda(
    at::Tensor & p,
    at::Tensor & m,
    at::Tensor & v,
    at::Tensor & g,
    float lr,
    float beta1,
    float beta2,
    float eps,
    float grad_scale,
    int step,
    int bias_correction,
    float decay)
{
    //Get tensor size
    int tsize = p.numel();
    //Determine #threads and #blocks
    const int threadsPerBlock = 512;
    const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
    AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
    //Constants
    float step_size = lr;
    if (bias_correction == 1) {
        const double bias_correction1 = 1.0 - std::pow(static_cast<double>(beta1), step);
        const double bias_correction2 = 1.0 - std::pow(static_cast<double>(beta2), step);
        step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
    }
    float decay_size = 1.0;
    if (decay != 0.0) {
        decay_size = 1.0 - step_size * decay;
    }
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
        AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
        using namespace at; // prevents "toString is undefined" errors
        DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
            using accscalar_t = at::acc_type<scalar_t_0, true>;
            adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
                p.data_ptr<scalar_t_0>(),
                m.data_ptr<accscalar_t>(),
                v.data_ptr<accscalar_t>(),
                g.data_ptr<scalar_t_0>(),
                beta1,
                beta2,
                eps,
                grad_scale,
                step_size,
                tsize,
                decay_size);
        );
    } else {
        using namespace at;
        DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
            adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks, threadsPerBlock, 0, stream>>>(
                p.data_ptr<scalar_t_0>(),
                m.data_ptr<scalar_t_0>(),
                v.data_ptr<scalar_t_0>(),
                g.data_ptr<scalar_t_0>(),
                beta1,
                beta2,
                eps,
                grad_scale,
                step_size,
                tsize,
                decay_size);
        );
    }
    AT_CUDA_CHECK(cudaGetLastError());
}
Uni-Core-main/csrc/bak/adam_kernel.hip
0 → 100644
// !!! This is a file automatically generated by hipify!!!
#include "ATen/ATen.h"
#include "ATen/hip/HIPContext.h"
#include "ATen/hip/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/hip/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
GRAD_T* __restrict__ p,
T* __restrict__ m,
T* __restrict__ v,
const GRAD_T * __restrict__ g,
const float b1,
const float b2,
const float eps,
const float grad_scale,
const float step_size,
const size_t tsize,
const float decay_size)
{
//Assuming 2D grids and 2D blocks
const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
const int threadsPerBlock = blockDim.x * blockDim.y;
const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
const int i = (blockId * threadsPerBlock + threadIdInBlock);
const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
for (int j = i; j < tsize; j+=totThreads) {
// weight decay
T cur_p = (T)p[j] * decay_size;
T scaled_grad = static_cast<T>(g[j]) / grad_scale;
m[j] = b1*m[j] + (1-b1)*scaled_grad;
v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
const float update = m[j] / (sqrtf(v[j]) + eps);
p[j] = cur_p - (step_size*update);
}
}
void fused_adam_cuda(
at::Tensor & p,
at::Tensor & m,
at::Tensor & v,
at::Tensor & g,
float lr,
float beta1,
float beta2,
float eps,
float grad_scale,
int step,
int bias_correction,
float decay)
{
//Get tensor size
int tsize = p.numel();
//Determine #threads and #blocks
const int threadsPerBlock = 512;
const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
//Constants
float step_size = lr;
if (bias_correction == 1) {
const double bias_correction1 = 1.0 - ::pow(static_cast<double>(beta1), step);
const double bias_correction2 = 1.0 - ::pow(static_cast<double>(beta2), step);
step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
}
float decay_size = 1.0;
if (decay != 0.0) {
decay_size = 1.0 - step_size * decay;
}
hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
using namespace at; // prevents "toString is undefined" errors
DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
hipLaunchKernelGGL(( adam_cuda_kernel<accscalar_t, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<accscalar_t>(),
v.data_ptr<accscalar_t>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
} else {
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
hipLaunchKernelGGL(( adam_cuda_kernel<scalar_t_0, scalar_t_0>), dim3(blocks),dim3(threadsPerBlock), 0, stream,
p.data_ptr<scalar_t_0>(),
m.data_ptr<scalar_t_0>(),
v.data_ptr<scalar_t_0>(),
g.data_ptr<scalar_t_0>(),
beta1,
beta2,
eps,
grad_scale,
step_size,
tsize,
decay_size);
);
}
AT_CUDA_CHECK(hipGetLastError());
}
Uni-Core-main/csrc/bak/adam_kernel_hip.cpp
0 → 100644
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "ATen/ATen.h"
#include "ATen/hip/HIPContext.h"
#include "ATen/hip/detail/IndexUtils.cuh"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cmath>
#include "ATen/TensorUtils.h"
#include "ATen/AccumulateType.h"
#include <ATen/hip/Exceptions.h>
#include "type_shim.h"
template <typename T, typename GRAD_T>
__global__ void adam_cuda_kernel(
    GRAD_T* __restrict__ p,
    T* __restrict__ m,
    T* __restrict__ v,
    const GRAD_T * __restrict__ g,
    const float b1,
    const float b2,
    const float eps,
    const float grad_scale,
    const float step_size,
    const size_t tsize,
    const float decay_size)
{
    //Assuming 2D grids and 2D blocks
    const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
    const int threadsPerBlock = blockDim.x * blockDim.y;
    const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
    const int i = (blockId * threadsPerBlock + threadIdInBlock);
    const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
    for (int j = i; j < tsize; j+=totThreads) {
        // weight decay
        T cur_p = (T)p[j] * decay_size;
        T scaled_grad = static_cast<T>(g[j]) / grad_scale;
        m[j] = b1*m[j] + (1-b1)*scaled_grad;
        v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
        const float update = m[j] / (sqrtf(v[j]) + eps);
        p[j] = cur_p - (step_size*update);
    }
}
void fused_adam_cuda(
    at::Tensor & p,
    at::Tensor & m,
    at::Tensor & v,
    at::Tensor & g,
    float lr,
    float beta1,
    float beta2,
    float eps,
    float grad_scale,
    int step,
    int bias_correction,
    float decay)
{
    //Get tensor size
    int tsize = p.numel();
    //Determine #threads and #blocks
    const int threadsPerBlock = 512;
    const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
    AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
    //Constants
    float step_size = lr;
    if (bias_correction == 1) {
        const double bias_correction1 = 1.0 - std::pow(static_cast<double>(beta1), step);
        const double bias_correction2 = 1.0 - std::pow(static_cast<double>(beta2), step);
        step_size = static_cast<float>(lr * std::sqrt(bias_correction2) / bias_correction1);
    }
    float decay_size = 1.0;
    if (decay != 0.0) {
        decay_size = 1.0 - step_size * decay;
    }
    hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
    if (g.scalar_type() == at::ScalarType::Half || g.scalar_type() == at::ScalarType::BFloat16) {
        AT_ASSERTM(p.scalar_type() == g.scalar_type(), "expected parameter to be the same type as grad");
        using namespace at; // prevents "toString is undefined" errors
        DISPATCH_FLOAT_AND_HALF_AND_BF16(g.scalar_type(), 0, "adam_cuda_kernel",
            using accscalar_t = at::acc_type<scalar_t_0, true>;
            hipLaunchKernelGGL((adam_cuda_kernel<accscalar_t, scalar_t_0>), dim3(blocks), dim3(threadsPerBlock), 0, stream,
                p.data_ptr<scalar_t_0>(),
                m.data_ptr<accscalar_t>(),
                v.data_ptr<accscalar_t>(),
                g.data_ptr<scalar_t_0>(),
                beta1,
                beta2,
                eps,
                grad_scale,
                step_size,
                tsize,
                decay_size);
        );
    } else {
        using namespace at;
        DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
            hipLaunchKernelGGL((adam_cuda_kernel<scalar_t_0, scalar_t_0>), dim3(blocks), dim3(threadsPerBlock), 0, stream,
                p.data_ptr<scalar_t_0>(),
                m.data_ptr<scalar_t_0>(),
                v.data_ptr<scalar_t_0>(),
                g.data_ptr<scalar_t_0>(),
                beta1,
                beta2,
                eps,
                grad_scale,
                step_size,
                tsize,
                decay_size);
        );
    }
    AT_CUDA_CHECK(hipGetLastError());
}
Uni-Core-main/csrc/bak/multi_tensor/interface.cpp
0 → 100644
#include <torch/extension.h>
at::Tensor multi_tensor_l2norm_cuda(
    int chunk_size,
    std::vector<std::vector<at::Tensor>> tensor_lists);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("l2norm", &multi_tensor_l2norm_cuda, "Computes L2 norm for a list of contiguous tensors");
}
\ No newline at end of file
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply.cuh
0 → 100644
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template <int n>
struct TensorListMetadata
{
    void* addresses[n][depth_to_max_tensors[n - 1]];
    int sizes[depth_to_max_tensors[n - 1]];
    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
    int block_to_chunk[depth_to_max_blocks[n - 1]];
    int start_tensor_this_launch;
};
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
    int chunk_size,
    T tl,
    U callable,
    ArgTypes... args)
{
    callable(chunk_size, tl, args...);
}
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
    int block_size,
    int chunk_size,
    const std::vector<std::vector<at::Tensor>>& tensor_lists,
    T callable,
    ArgTypes... args)
{
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    auto ref_dtype = tensor_lists[0][0].scalar_type();
    for (int l = 0; l < tensor_lists.size(); l++)
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++)
        {
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory = (contiguous_memory ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }
    int ntensors = tensor_lists[0].size();
    TensorListMetadata<depth> tl;
    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::cuda::getCurrentCUDAStream();
    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for (int t = 0; t < ntensors; t++)
    {
        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;
        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
        {
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;
            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk)
            {
                multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
                    chunk_size, tl, callable, args...);
                AT_CUDA_CHECK(cudaGetLastError());
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1)
                {
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                }
                else
                {
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }
}
\ No newline at end of file
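The host loop in multi_tensor_apply flattens a list of tensors into (tensor, chunk) work items, packing at most depth_to_max_tensors[depth-1] tensor pointers and depth_to_max_blocks[depth-1] block descriptors per kernel launch, with one CUDA block per descriptor. The stripped-down, CPU-only sketch below (hypothetical numbers and names, not part of the commit) only illustrates that batching logic.

// Illustration of the (tensor, chunk) packing performed by multi_tensor_apply (not in the commit).
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    const int chunk_size = 4;               // elements handled by one block-sized chunk
    const int max_blocks = 5;               // stand-in for depth_to_max_blocks[depth - 1]
    std::vector<int> numels = {10, 3, 7};   // element counts of three hypothetical tensors

    std::vector<std::pair<int, int>> batch; // (tensor index, chunk index) for one launch
    for (int t = 0; t < (int)numels.size(); ++t) {
        int chunks = (numels[t] + chunk_size - 1) / chunk_size;
        for (int c = 0; c < chunks; ++c) {
            batch.emplace_back(t, c);
            bool blocks_full = (int)batch.size() == max_blocks;
            bool last_chunk = (t == (int)numels.size() - 1) && (c == chunks - 1);
            if (blocks_full || last_chunk) {
                // This is the point where multi_tensor_apply_kernel would be launched
                // with one block per entry currently held in `batch`.
                std::printf("launch with %zu blocks\n", batch.size());
                batch.clear();
            }
        }
    }
    return 0;
}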
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply.h
0 → 100644
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/HIPContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template <int n>
struct TensorListMetadata
{
    void* addresses[n][depth_to_max_tensors[n - 1]];
    int sizes[depth_to_max_tensors[n - 1]];
    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
    int block_to_chunk[depth_to_max_blocks[n - 1]];
    int start_tensor_this_launch;
};
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
    int chunk_size,
    T tl,
    U callable,
    ArgTypes... args)
{
    callable(chunk_size, tl, args...);
}
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
    int block_size,
    int chunk_size,
    const std::vector<std::vector<at::Tensor>>& tensor_lists,
    T callable,
    ArgTypes... args)
{
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    auto ref_dtype = tensor_lists[0][0].scalar_type();
    for (int l = 0; l < tensor_lists.size(); l++)
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++)
        {
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory = (contiguous_memory ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }
    int ntensors = tensor_lists[0].size();
    TensorListMetadata<depth> tl;
    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::cuda::getCurrentCUDAStream();
    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for (int t = 0; t < ntensors; t++)
    {
        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;
        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
        {
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;
            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk)
            {
                multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
                    chunk_size, tl, callable, args...);
                AT_CUDA_CHECK(hipGetLastError());
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1)
                {
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                }
                else
                {
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }
}
\ No newline at end of file
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply_hip.cuh
0 → 100644
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template <int n>
struct TensorListMetadata
{
    void* addresses[n][depth_to_max_tensors[n - 1]];
    int sizes[depth_to_max_tensors[n - 1]];
    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
    int block_to_chunk[depth_to_max_blocks[n - 1]];
    int start_tensor_this_launch;
};
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
    int chunk_size,
    T tl,
    U callable,
    ArgTypes... args)
{
    callable(chunk_size, tl, args...);
}
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
    int block_size,
    int chunk_size,
    const std::vector<std::vector<at::Tensor>>& tensor_lists,
    T callable,
    ArgTypes... args)
{
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    auto ref_dtype = tensor_lists[0][0].scalar_type();
    for (int l = 0; l < tensor_lists.size(); l++)
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++)
        {
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory = (contiguous_memory ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }
    int ntensors = tensor_lists[0].size();
    TensorListMetadata<depth> tl;
    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for (int t = 0; t < ntensors; t++)
    {
        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;
        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
        {
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;
            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk)
            {
                hipLaunchKernelGGL((multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream,
                    chunk_size, tl, callable, args...);
                AT_CUDA_CHECK(hipGetLastError());
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1)
                {
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                }
                else
                {
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }
}
\ No newline at end of file
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_apply_hip.h
0 → 100644
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <assert.h>
#include <iostream>
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template <int n>
struct TensorListMetadata
{
    void* addresses[n][depth_to_max_tensors[n - 1]];
    int sizes[depth_to_max_tensors[n - 1]];
    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
    int block_to_chunk[depth_to_max_blocks[n - 1]];
    int start_tensor_this_launch;
};
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
    int chunk_size,
    T tl,
    U callable,
    ArgTypes... args)
{
    callable(chunk_size, tl, args...);
}
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(
    int block_size,
    int chunk_size,
    const std::vector<std::vector<at::Tensor>>& tensor_lists,
    T callable,
    ArgTypes... args)
{
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    auto ref_dtype = tensor_lists[0][0].scalar_type();
    for (int l = 0; l < tensor_lists.size(); l++)
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++)
        {
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory = (contiguous_memory ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast) ||
                tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast3d));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].scalar_type() == ref_dtype, "A tensor was not the same dtype as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }
    int ntensors = tensor_lists[0].size();
    TensorListMetadata<depth> tl;
    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for (int t = 0; t < ntensors; t++)
    {
        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;
        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
        for (int chunk = 0; chunk < chunks_this_tensor; chunk++)
        {
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;
            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk)
            {
                hipLaunchKernelGGL((multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream,
                    chunk_size, tl, callable, args...);
                AT_CUDA_CHECK(hipGetLastError());
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1)
                {
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                }
                else
                {
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }
}
\ No newline at end of file
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel.cpp
0 → 100644
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/HIPContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
    return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
    typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
    ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
    __device__ __forceinline__ void operator()(
        int chunk_size,
        TensorListMetadata<1>& tl,
        float* output)
    {
        int tensor_loc = tl.block_to_tensor[blockIdx.x];
        int chunk_idx = tl.block_to_chunk[blockIdx.x];
        int n = tl.sizes[tensor_loc];
        x_t* x = (x_t*)tl.addresses[0][tensor_loc];
        x += chunk_idx*chunk_size;
        n -= chunk_idx*chunk_size;
        __shared__ float s_vals[512];
        float vals[ILP];
        x_t r_x[ILP];
        for(int i = 0; i < ILP; i++)
        {
            vals[i] = 0.0f;
            r_x[i] = (x_t)0.0f;
        }
        if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
        {
            for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
            {
                // load
                load_store(r_x, x, 0, i_start);
#pragma unroll
                for(int ii = 0; ii < ILP; ii++)
                {
                    float next = static_cast<float>(r_x[ii]);
                    vals[ii] += next*next;
                }
            }
        }
        else
        {
            for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
            {
#pragma unroll
                for(int ii = 0; ii < ILP; ii++)
                {
                    int i = i_start + threadIdx.x + ii*blockDim.x;
                    if(i < n && i < chunk_size)
                    {
                        float next = static_cast<float>(x[i]);
                        vals[ii] += next*next;
                    }
                }
            }
        }
        float val = 0.f;
        for(int i = 0; i < ILP; i++)
            val += vals[i];
        float res = reduce_block_into_lanes(s_vals, val);
        if(threadIdx.x == 0)
        {
            output[blockIdx.x] += res;
        }
    }
};
__global__ void cleanup(
    float* output,
    float* ret)
{
    __shared__ float vals[512];
    if(blockIdx.x == 0)
    {
        float val = 0;
        if(threadIdx.x < 320)
            val = output[threadIdx.x];
        float final = reduce_block_into_lanes(vals, val);
        if(threadIdx.x == 0)
            *ret = sqrt(final);
    }
}
at::Tensor multi_tensor_l2norm_cuda(
    int chunk_size,
    std::vector<std::vector<at::Tensor>> tensor_lists)
{
    auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
    auto output = at::zeros({320}, float_options);
    switch (tensor_lists[0][0].scalar_type()){
        case at::ScalarType::Float: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<float>(),
                output.data_ptr<float>()
            );
            break;
        }
        case at::ScalarType::Half: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<half>(),
                output.data_ptr<float>()
            );
            break;
        }
        case at::ScalarType::BFloat16: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<nv_bfloat16>(),
                output.data_ptr<float>()
            );
            break;
        }
    }
    AT_CUDA_CHECK(hipGetLastError());
    auto ret = at::empty({1}, output.options());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
    auto stream = at::cuda::getCurrentCUDAStream();
    cleanup<<<1, 512, 0, stream>>>(
        output.data_ptr<float>(),
        ret.data_ptr<float>());
    return ret;
}
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel.cu
0 → 100644
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
//#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
//#define BLOCK_SIZE 512
#define BLOCK_SIZE 256
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
    return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
    typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
    ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
    __device__ __forceinline__ void operator()(
        int chunk_size,
        TensorListMetadata<1>& tl,
        float* output)
    {
        int tensor_loc = tl.block_to_tensor[blockIdx.x];
        int chunk_idx = tl.block_to_chunk[blockIdx.x];
        int n = tl.sizes[tensor_loc];
        x_t* x = (x_t*)tl.addresses[0][tensor_loc];
        x += chunk_idx*chunk_size;
        n -= chunk_idx*chunk_size;
        __shared__ float s_vals[512];
        float vals[ILP];
        x_t r_x[ILP];
        for(int i = 0; i < ILP; i++)
        {
            vals[i] = 0.0f;
            r_x[i] = (x_t)0.0f;
        }
        if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
        {
            for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
            {
                // load
                load_store(r_x, x, 0, i_start);
#pragma unroll
                for(int ii = 0; ii < ILP; ii++)
                {
                    float next = static_cast<float>(r_x[ii]);
                    vals[ii] += next*next;
                }
            }
        }
        else
        {
            for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
            {
#pragma unroll
                for(int ii = 0; ii < ILP; ii++)
                {
                    int i = i_start + threadIdx.x + ii*blockDim.x;
                    if(i < n && i < chunk_size)
                    {
                        float next = static_cast<float>(x[i]);
                        vals[ii] += next*next;
                    }
                }
            }
        }
        float val = 0.f;
        for(int i = 0; i < ILP; i++)
            val += vals[i];
        float res = reduce_block_into_lanes(s_vals, val);
        if(threadIdx.x == 0)
        {
            output[blockIdx.x] += res;
        }
    }
};
__global__ void cleanup(
    float* output,
    float* ret)
{
    __shared__ float vals[512];
    if(blockIdx.x == 0)
    {
        float val = 0;
        if(threadIdx.x < 320)
            val = output[threadIdx.x];
        float final = reduce_block_into_lanes(vals, val);
        if(threadIdx.x == 0)
            *ret = sqrt(final);
    }
}
at::Tensor multi_tensor_l2norm_cuda(
    int chunk_size,
    std::vector<std::vector<at::Tensor>> tensor_lists)
{
    auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
    auto output = at::zeros({320}, float_options);
    switch (tensor_lists[0][0].scalar_type()){
        case at::ScalarType::Float: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<float>(),
                output.data_ptr<float>()
            );
            break;
        }
        case at::ScalarType::Half: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<half>(),
                output.data_ptr<float>()
            );
            break;
        }
        case at::ScalarType::BFloat16: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<nv_bfloat16>(),
                output.data_ptr<float>()
            );
            break;
        }
    }
    AT_CUDA_CHECK(cudaGetLastError());
    auto ret = at::empty({1}, output.options());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
    auto stream = at::cuda::getCurrentCUDAStream();
    cleanup<<<1, 512, 0, stream>>>(
        output.data_ptr<float>(),
        ret.data_ptr<float>());
    return ret;
}
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel.hip
0 → 100644
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
//#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply_hip.cuh"
//#define BLOCK_SIZE 512
#define BLOCK_SIZE 256
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
TensorListMetadata<1>& tl,
float* output)
{
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP];
x_t r_x[ILP];
for(int i = 0; i < ILP; i++)
{
vals[i] = 0.0f;
r_x[i] = (x_t)0.0f;
}
if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
{
for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
{
// load
load_store(r_x, x, 0 , i_start);
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
float next = static_cast<float>(r_x[ii]);
vals[ii] += next*next;
}
}
}
else
{
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] += next*next;
}
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val += vals[i];
float res = reduce_block_into_lanes(s_vals, val);
if(threadIdx.x == 0)
{
output[blockIdx.x] += res;
}
}
};
__global__ void cleanup(
float* output,
float* ret)
{
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(final);
}
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
std::vector<std::vector<at::Tensor>> tensor_lists)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
auto output = at::zeros({320}, float_options);
switch (tensor_lists[0][0].scalar_type()){
case at::ScalarType::Float: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<float>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::Half: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<half>(),
output.data_ptr<float>()
);
break;
}
case at::ScalarType::BFloat16: {
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
tensor_lists,
L2NormFunctor<nv_bfloat16>(),
output.data_ptr<float>()
);
break;
}
}
AT_CUDA_CHECK(hipGetLastError());
auto ret = at::empty({1}, output.options());
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(output));
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
hipLaunchKernelGGL(( cleanup), dim3(1), dim3(512), 0, stream,
output.data_ptr<float>(),
ret.data_ptr<float>());
return ret;
}
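multi_tensor_l2norm_cuda returns a single global L2 norm over every element of every tensor in tensor_lists[0]: each block accumulates a partial sum of squares in fp32 into a 320-entry buffer, and the cleanup kernel reduces that buffer and takes one square root at the end. For checking small inputs, a host-side reference of the same reduction could look like the sketch below (illustration only, not part of the commit; the helper name l2norm_reference is hypothetical).

// CPU reference for the global L2 norm computed by multi_tensor_l2norm_cuda (illustration only).
#include <cmath>
#include <vector>

float l2norm_reference(const std::vector<std::vector<float>>& tensors) {
    // Accumulate the sum of squares in float, mirroring the kernel's fp32 accumulation,
    // then take a single square root, as the cleanup kernel does.
    float sum_sq = 0.0f;
    for (const auto& t : tensors)
        for (float x : t)
            sum_sq += x * x;
    return std::sqrt(sum_sq);
}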
Uni-Core-main/csrc/bak/multi_tensor/multi_tensor_l2norm_kernel_hip.cpp
0 → 100644
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <cuda_bf16.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply_hip.cuh"
#define BLOCK_SIZE 512
#define ILP 4
template<typename T>
__device__ __forceinline__ bool is_aligned(T* p){
    return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
}
template<typename T>
__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
    typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
    ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
}
template<typename x_t>
struct L2NormFunctor
{
    __device__ __forceinline__ void operator()(
        int chunk_size,
        TensorListMetadata<1>& tl,
        float* output)
    {
        int tensor_loc = tl.block_to_tensor[blockIdx.x];
        int chunk_idx = tl.block_to_chunk[blockIdx.x];
        int n = tl.sizes[tensor_loc];
        x_t* x = (x_t*)tl.addresses[0][tensor_loc];
        x += chunk_idx*chunk_size;
        n -= chunk_idx*chunk_size;
        __shared__ float s_vals[512];
        float vals[ILP];
        x_t r_x[ILP];
        for(int i = 0; i < ILP; i++)
        {
            vals[i] = 0.0f;
            r_x[i] = (x_t)0.0f;
        }
        if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x))
        {
            for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
            {
                // load
                load_store(r_x, x, 0, i_start);
#pragma unroll
                for(int ii = 0; ii < ILP; ii++)
                {
                    float next = static_cast<float>(r_x[ii]);
                    vals[ii] += next*next;
                }
            }
        }
        else
        {
            for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
            {
#pragma unroll
                for(int ii = 0; ii < ILP; ii++)
                {
                    int i = i_start + threadIdx.x + ii*blockDim.x;
                    if(i < n && i < chunk_size)
                    {
                        float next = static_cast<float>(x[i]);
                        vals[ii] += next*next;
                    }
                }
            }
        }
        float val = 0.f;
        for(int i = 0; i < ILP; i++)
            val += vals[i];
        float res = reduce_block_into_lanes(s_vals, val);
        if(threadIdx.x == 0)
        {
            output[blockIdx.x] += res;
        }
    }
};
__global__ void cleanup(
    float* output,
    float* ret)
{
    __shared__ float vals[512];
    if(blockIdx.x == 0)
    {
        float val = 0;
        if(threadIdx.x < 320)
            val = output[threadIdx.x];
        float final = reduce_block_into_lanes(vals, val);
        if(threadIdx.x == 0)
            *ret = sqrt(final);
    }
}
at::Tensor multi_tensor_l2norm_cuda(
    int chunk_size,
    std::vector<std::vector<at::Tensor>> tensor_lists)
{
    auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
    auto output = at::zeros({320}, float_options);
    switch (tensor_lists[0][0].scalar_type()){
        case at::ScalarType::Float: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<float>(),
                output.data_ptr<float>()
            );
            break;
        }
        case at::ScalarType::Half: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<half>(),
                output.data_ptr<float>()
            );
            break;
        }
        case at::ScalarType::BFloat16: {
            multi_tensor_apply<1>(
                BLOCK_SIZE,
                chunk_size,
                tensor_lists,
                L2NormFunctor<nv_bfloat16>(),
                output.data_ptr<float>()
            );
            break;
        }
    }
    AT_CUDA_CHECK(hipGetLastError());
    auto ret = at::empty({1}, output.options());
    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(output));
    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
    hipLaunchKernelGGL((cleanup), dim3(1), dim3(512), 0, stream,
        output.data_ptr<float>(),
        ret.data_ptr<float>());
    return ret;
}
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16.cpp
0 → 100644
#include "hip/hip_runtime.h"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <c10/cuda/CUDAMathCompat.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cuda_bf16.h>
#include <hiprand_kernel.h>
#include <ATen/cuda/HIPContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
  uint32_t i;
  float f;
};

__global__ void fp32_to_bf16(
  const float* input,
  nv_bfloat16* output,
  const int tsize,
  uint64_t seed,
  uint64_t offset)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < tsize)
  {
    float_int_32 d;
    d.f = input[i];
    hiprandStatePhilox4_32_10_t state;
    hiprand_init(seed, i, offset, &state);
    d.i += hiprand(&state) & 0x0000ffff;
    output[i] = __float2bfloat16_rz(d.f);
  }
}

void fused_fp32_to_bf16_sr_cuda(
  at::Tensor & input,
  at::Tensor & output)
{
  int tsize = input.numel();
  const int threadsPerBlock = 512;
  const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
  AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
  AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
  AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
  auto gen = at::cuda::detail::getDefaultCUDAGenerator();
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen.mutex());
    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
  }
  uint64_t seed = std::get<0>(rng_engine_inputs);
  uint64_t offset = std::get<1>(rng_engine_inputs);
  hipStream_t stream = at::cuda::getCurrentCUDAStream();
  fp32_to_bf16<<<blocks, threadsPerBlock, 0, stream>>>(
    (const float*)input.data_ptr(),
    (nv_bfloat16*)output.data_ptr(),
    tsize,
    seed,
    offset);
  AT_CUDA_CHECK(hipGetLastError());
}
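The fp32_to_bf16 kernel above performs stochastic rounding: it adds 16 random low bits to the fp32 bit pattern and then converts to bfloat16 with round-toward-zero, so the result rounds up with probability proportional to the discarded fraction and the rounded values are unbiased on average. Below is a minimal host-side C++ sketch of the same bit trick, illustrative only and not part of the extension: it emulates __float2bfloat16_rz by keeping the high 16 bits, uses std::mt19937 in place of the Philox generator, and the helper names (float_to_bf16_rz, bf16_to_float, low16) are invented for the example.

// Host-side sketch of the stochastic-rounding bit trick used by fp32_to_bf16
// (illustrative only; helper names are not from the extension).
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <random>

// Truncate an fp32 value to bf16 (round toward zero) and return the 16-bit pattern.
static uint16_t float_to_bf16_rz(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

// Widen a bf16 bit pattern back to fp32.
static float bf16_to_float(uint16_t h) {
  uint32_t bits = static_cast<uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  std::mt19937 rng(0);
  std::uniform_int_distribution<uint32_t> low16(0, 0xffff);

  const float x = 1.001f;          // not exactly representable in bf16
  double mean = 0.0;
  const int trials = 100000;
  for (int t = 0; t < trials; ++t) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits += low16(rng) & 0x0000ffff;        // same perturbation as the kernel
    float perturbed;
    std::memcpy(&perturbed, &bits, sizeof(perturbed));
    mean += bf16_to_float(float_to_bf16_rz(perturbed));
  }
  // The average of the stochastically rounded samples approaches x,
  // whereas plain truncation would always round toward zero.
  std::printf("x = %f  mean over %d samples = %f\n", x, trials, mean / trials);
  return 0;
}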
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16.cu
0 → 100644
View file @
a1c29028
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <c10/cuda/CUDAMathCompat.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <curand_kernel.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
  uint32_t i;
  float f;
};

__global__ void fp32_to_bf16(
  const float* input,
  nv_bfloat16* output,
  const int tsize,
  uint64_t seed,
  uint64_t offset)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < tsize)
  {
    float_int_32 d;
    d.f = input[i];
    curandStatePhilox4_32_10_t state;
    curand_init(seed, i, offset, &state);
    d.i += curand(&state) & 0x0000ffff;
    output[i] = __float2bfloat16_rz(d.f);
  }
}

void fused_fp32_to_bf16_sr_cuda(
  at::Tensor & input,
  at::Tensor & output)
{
  int tsize = input.numel();
  const int threadsPerBlock = 512;
  const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
  AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
  AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
  AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
  auto gen = at::cuda::detail::getDefaultCUDAGenerator();
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen.mutex());
    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
  }
  uint64_t seed = std::get<0>(rng_engine_inputs);
  uint64_t offset = std::get<1>(rng_engine_inputs);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  fp32_to_bf16<<<blocks, threadsPerBlock, 0, stream>>>(
    (const float*)input.data_ptr(),
    (nv_bfloat16*)output.data_ptr(),
    tsize,
    seed,
    offset);
  AT_CUDA_CHECK(cudaGetLastError());
}
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16.hip
0 → 100644
View file @
a1c29028
// !!! This is a file automatically generated by hipify!!!
#include <vector>
#include <ATen/ATen.h>
#include <ATen/hip/HIPGeneratorImpl.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include <c10/hip/HIPMathCompat.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cuda_bf16.h>
#include <hiprand/hiprand_kernel.h>
#include <ATen/hip/HIPContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
  uint32_t i;
  float f;
};

__global__ void fp32_to_bf16(
  const float* input,
  nv_bfloat16* output,
  const int tsize,
  uint64_t seed,
  uint64_t offset) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < tsize) {
    float_int_32 d;
    d.f = input[i];
    hiprandStatePhilox4_32_10_t state;
    hiprand_init(seed, i, offset, &state);
    d.i += hiprand(&state) & 0x0000ffff;
    output[i] = __float2bfloat16_rz(d.f);
  }
}

void fused_fp32_to_bf16_sr_cuda(
  at::Tensor & input,
  at::Tensor & output)
{
  int tsize = input.numel();
  const int threadsPerBlock = 512;
  const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
  AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
  AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
  AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
  auto gen = at::cuda::detail::getDefaultCUDAGenerator();
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen.mutex());
    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
  }
  uint64_t seed = std::get<0>(rng_engine_inputs);
  uint64_t offset = std::get<1>(rng_engine_inputs);
  hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
  hipLaunchKernelGGL((fp32_to_bf16), dim3(blocks), dim3(threadsPerBlock), 0, stream,
    (const float*)input.data_ptr(),
    (nv_bfloat16*)output.data_ptr(),
    tsize,
    seed,
    offset);
  AT_CUDA_CHECK(hipGetLastError());
}
Uni-Core-main/csrc/bak/rounding/fp32_to_bf16_hip.cpp
0 → 100644
View file @
a1c29028
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/hip/HIPGeneratorImpl.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include <c10/hip/HIPMathCompat.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cuda_bf16.h>
#include <hiprand_kernel.h>
#include <ATen/hip/HIPContext.h>
#include <torch/extension.h>
#include <math.h>
#include <iostream>
union float_int_32
{
  uint32_t i;
  float f;
};

__global__ void fp32_to_bf16(
  const float* input,
  nv_bfloat16* output,
  const int tsize,
  uint64_t seed,
  uint64_t offset)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < tsize)
  {
    float_int_32 d;
    d.f = input[i];
    hiprandStatePhilox4_32_10_t state;
    hiprand_init(seed, i, offset, &state);
    d.i += hiprand(&state) & 0x0000ffff;
    output[i] = __float2bfloat16_rz(d.f);
  }
}

void fused_fp32_to_bf16_sr_cuda(
  at::Tensor & input,
  at::Tensor & output)
{
  int tsize = input.numel();
  const int threadsPerBlock = 512;
  const int blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
  AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(input), "parameter tensor is too large to be indexed with int32");
  AT_ASSERTM(input.scalar_type() == at::ScalarType::Float, "expected input to be float32 tensor");
  AT_ASSERTM(output.scalar_type() == at::ScalarType::BFloat16, "expected output to be bfloat16 tensor");
  auto gen = at::cuda::detail::getDefaultCUDAGenerator();
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen.mutex());
    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(1);
  }
  uint64_t seed = std::get<0>(rng_engine_inputs);
  uint64_t offset = std::get<1>(rng_engine_inputs);
  hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
  hipLaunchKernelGGL((fp32_to_bf16), dim3(blocks), dim3(threadsPerBlock), 0, stream,
    (const float*)input.data_ptr(),
    (nv_bfloat16*)output.data_ptr(),
    tsize,
    seed,
    offset);
  AT_CUDA_CHECK(hipGetLastError());
}
Uni-Core-main/csrc/bak/rounding/interface.cpp
0 → 100644
View file @
a1c29028
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
void fused_fp32_to_bf16_sr_cuda(
  at::Tensor & input,
  at::Tensor & output);

#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

void fused_fp32_to_bf16_sr(
  at::Tensor & input,
  at::Tensor & output)
{
  CHECK_INPUT(input);
  CHECK_INPUT(output);
  int64_t num_elem = input.numel();
  AT_ASSERTM(output.numel() == num_elem, "number of elements in input and output tensors should be equal");
  fused_fp32_to_bf16_sr_cuda(input, output);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
  m.def("fp32_to_bf16_sr", &fused_fp32_to_bf16_sr, "fused fp32 to bf16 random rounding");
}