OpenDAS / ColossalAI · Commit 6fcb3818
Authored Apr 02, 2022 by Wangbo Zhao; committed by binmakeswell on Apr 06, 2022
[NFC] polish colossalai/kernel/cuda_native/csrc/multi_tensor_l2norm_kernel.cu code style (#635)
parent 8a5d526e
Showing 1 changed file with 324 additions and 392 deletions:
colossalai/kernel/cuda_native/csrc/multi_tensor_l2norm_kernel.cu
// modified from
// https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_l2norm_kernel.cu
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
...
@@ -9,37 +10,29 @@
#include <assert.h>

#include "multi_tensor_apply.cuh"
#include "type_shim.h"

#define BLOCK_SIZE 512
#define ILP 4
template <typename T>
__device__ __forceinline__ bool is_aligned(T *p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}
template <typename T>
__device__ __forceinline__ void load_store(T *dst, T *src, int dst_offset,
                                           int src_offset) {
  typedef
      typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type
          LT;
  ((LT *)dst)[dst_offset] = ((LT *)src)[src_offset];
}
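// For reference: with x_t == float and ILP == 4, the aligned fast path that
// calls load_store moves 16 bytes per thread in a single transaction, roughly
// equivalent to (sketch for illustration only):
//
//   float4 v = reinterpret_cast<float4 *>(x)[i_start];
//
// The aligned_storage version above generalizes this to any x_t whose pointer
// passes is_aligned.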
template <typename x_t>
struct L2NormFunctor {
  __device__ __forceinline__ void operator()(
      int chunk_size, volatile int *noop_gmem, TensorListMetadata<1> &tl,
      float *output, float *output_per_tensor, bool per_tensor,
      int max_chunks_per_tensor) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;
...
@@ -55,39 +48,34 @@ struct L2NormFunctor
    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x)) {
      for (int i_start = threadIdx.x;
           i_start * ILP < n && i_start * ILP < chunk_size;
           i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] += next * next;
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            float next = static_cast<float>(x[i]);
            vals[ii] += next * next;
          }
...
@@ -101,30 +89,26 @@ struct L2NormFunctor
    float final = reduce_block_into_lanes(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final))
        *noop_gmem = 1;  // Blindly fire off a write. These will race but that's ok.
      output[blockIdx.x] += final;
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) *
                              max_chunks_per_tensor +
                          chunk_idx] = final;
    }
  }
};
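// Note: the functor only produces partial sums of squares. Each block adds
// its reduced value into output[blockIdx.x] (a 320-slot buffer allocated by
// the host code below) and, when per_tensor is set, records one partial per
// (tensor, chunk). The final square root is deferred to cleanup/cleanup_v2,
// i.e. roughly:
//
//   *ret = sqrt(output[0] + ... + output[319])
//   ret_per_tensor[t] =
//       sqrt(sum over c of output_per_tensor[t * max_chunks_per_tensor + c])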
// Probably better to template, but since we are not likely to support other
// norm
template <typename x_t>
struct MaxNormFunctor {
  __device__ __forceinline__ void operator()(
      int chunk_size, volatile int *noop_gmem, TensorListMetadata<1> &tl,
      float *output, float *output_per_tensor, bool per_tensor,
      int max_chunks_per_tensor) {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;
...
@@ -140,39 +124,34 @@ struct MaxNormFunctor
    __shared__ float s_vals[512];

    float vals[ILP];  // = {0}; // this probably works too but I want to be sure...
    x_t r_x[ILP];
    for (int i = 0; i < ILP; i++) {
      vals[i] = 0.f;
      r_x[i] = 0;
    }

    // to make things simple, we put aligned case in a different code path
    if (n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x)) {
      for (int i_start = threadIdx.x;
           i_start * ILP < n && i_start * ILP < chunk_size;
           i_start += blockDim.x) {
        // load
        load_store(r_x, x, 0, i_start);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          float next = static_cast<float>(r_x[ii]);
          vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
        }
      }
    } else {
      for (int i_start = 0; i_start < n && i_start < chunk_size;
           i_start += blockDim.x * ILP) {
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          int i = i_start + threadIdx.x + ii * blockDim.x;
          if (i < n && i < chunk_size) {
            float next = static_cast<float>(x[i]);
            vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
          }
...
@@ -186,29 +165,25 @@ struct MaxNormFunctor
    float final = reduce_block_into_lanes_max_op(s_vals, val);

    if (threadIdx.x == 0) {
      if (!isfinite(final))
        *noop_gmem = 1;  // Blindly fire off a write. These will race but that's ok.
      output[blockIdx.x] = fmaxf(fabsf(output[blockIdx.x]), fabsf(final));
      if (per_tensor)
        output_per_tensor[(tl.start_tensor_this_launch + tensor_loc) *
                              max_chunks_per_tensor +
                          chunk_idx] = final;
    }
  }
};
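// MaxNormFunctor is the L-inf counterpart of L2NormFunctor: it keeps a
// running fmaxf of absolute values instead of accumulating next * next, and
// its partials are later combined with fmaxf (reduce_block_into_lanes_max_op)
// rather than summed and square-rooted.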
__global__ void cleanup(float *output, float *output_per_tensor, float *ret,
                        float *ret_per_tensor, bool per_tensor,
                        int max_chunks_per_tensor) {
  __shared__ float vals[512];

  if (blockIdx.x == 0) {
    float val = 0;
    if (threadIdx.x < 320) val = output[threadIdx.x];
...
@@ -219,9 +194,9 @@ __global__ void cleanup(
    *ret = sqrt(final);
  }

  if (per_tensor) {
    float *output_this_tensor =
        output_per_tensor + blockIdx.x * max_chunks_per_tensor;

    float val = 0;
    for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
...
@@ -234,45 +209,33 @@ __global__ void cleanup(
  }
}

__global__ void cleanup_v2(float *output, float *output_per_tensor, float *ret,
                           float *ret_per_tensor, bool per_tensor,
                           int max_chunks_per_tensor, int norm_type,
                           float alpha, float beta) {
  __shared__ float vals[512];

  if (blockIdx.x == 0) {
    float val = 0;
    if (threadIdx.x < 320) val = output[threadIdx.x];
    if (norm_type == 0) {
      float final = reduce_block_into_lanes_max_op(vals, val);
      if (threadIdx.x == 0) *ret = alpha * (*ret) + beta * final;
    } else {
      float final = reduce_block_into_lanes(vals, val);
      if (threadIdx.x == 0)
        *ret = sqrt(alpha * (*ret) * (*ret) + beta * final);
    }
  }
  if (per_tensor) {
    float *output_this_tensor =
        output_per_tensor + blockIdx.x * max_chunks_per_tensor;

    if (norm_type == 0) {
      float val = 0;
      for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
        val = fmaxf(fabsf(val), fabsf(output_this_tensor[i]));
...
@@ -280,10 +243,9 @@ __global__ void cleanup_v2(
      float final = reduce_block_into_lanes_max_op(vals, val);
      if (threadIdx.x == 0)
        ret_per_tensor[blockIdx.x] =
            alpha * ret_per_tensor[blockIdx.x] + beta * final;
    } else {
      float val = 0;
      for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
        val += output_this_tensor[i];
...
@@ -291,18 +253,19 @@ __global__ void cleanup_v2(
      float final = reduce_block_into_lanes(vals, val);
      if (threadIdx.x == 0)
        ret_per_tensor[blockIdx.x] =
            sqrt(alpha * ret_per_tensor[blockIdx.x] *
                     ret_per_tensor[blockIdx.x] +
                 beta * final);
    }
  }
}
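// cleanup_v2 runs with one block per tensor (launched below as
// cleanup_v2<<<ntensors, 512, 0, stream>>>). Block 0 additionally folds the
// 320 per-block partials into the scalar *ret using the decayed update noted
// in the comment further down:
//   norm_type == 0:  gn = alpha * gn + beta * n
//   otherwise:       gn = sqrt(alpha * gn^2 + beta * n^2)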
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
    int chunk_size, at::Tensor noop_flag,
    std::vector<std::vector<at::Tensor>> tensor_lists,
    at::optional<bool> per_tensor_python) {
  bool per_tensor =
      per_tensor_python.has_value() ? per_tensor_python.value() : false;

  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  auto output = at::zeros({320}, float_options);
...
@@ -313,40 +276,34 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  if (per_tensor) {
    for (int t = 0; t < ntensors; t++) {
      int max_chunks_this_tensor =
          (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
      if (max_chunks_this_tensor > max_chunks_per_tensor)
        max_chunks_per_tensor = max_chunks_this_tensor;
    }
    output_per_tensor =
        at::zeros({ntensors * max_chunks_per_tensor}, float_options);
    ret_per_tensor = at::empty({ntensors}, float_options);
  } else {
    ret_per_tensor = at::empty({0}, float_options);
  }
  DISPATCH_FLOAT_AND_HALF(
      tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
      multi_tensor_apply<1>(
          BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
          L2NormFunctor<scalar_t_0>(), output.DATA_PTR<float>(),
          per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
          per_tensor, max_chunks_per_tensor);)

  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());
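// Note on the macro above: DISPATCH_FLOAT_AND_HALF (from type_shim.h)
// expands to a switch over the runtime scalar type, instantiating its body
// with scalar_t_0 bound to each supported type. That is why the call ends
// with the unusual ");)" -- one paren closes multi_tensor_apply, the other
// closes the macro invocation.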
  // This involves one more small kernel launch, but will be negligible end to
  // end. I could get rid of these by hacking the functor + multi tensor
  // harness with persistence logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
...
@@ -354,8 +311,7 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
      output.DATA_PTR<float>(),
      per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
      ret.DATA_PTR<float>(),
      per_tensor ? ret_per_tensor.DATA_PTR<float>() : nullptr, per_tensor,
      max_chunks_per_tensor);

  return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
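// Usage sketch (illustration only; t0/t1 and the chunk size are assumed):
//
//   auto noop_flag = at::zeros({1}, at::dtype(at::kInt).device(at::kCUDA));
//   auto [norm, per_tensor_norms] =
//       multi_tensor_l2norm_cuda(2048 * 32, noop_flag, {{t0, t1}}, true);
//
// norm is a 1-element tensor holding the global L2 norm over every element
// of t0 and t1; per_tensor_norms holds one L2 norm per tensor.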
...
@@ -366,16 +322,12 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
// L-2: gn = sqrt(a * gn^2 + b * n^2)
// L-inf: gn = a * gn + b * n
void multi_tensor_norm_out_cuda(
    int chunk_size, at::Tensor noop_flag,
    std::vector<std::vector<at::Tensor>> tensor_lists, at::Tensor out,
    const float alpha, const float beta, const int norm_type) {
  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
  TORCH_CHECK(tensor_lists[0][0].device() == noop_flag.device(),
              "noop flag should be on the same device as tensors");
  // we don't need it zero-initialized, thus use empty here
  auto output = at::empty({320}, float_options);
...
@@ -385,54 +337,40 @@ void multi_tensor_norm_out_cuda(
  int ntensors = tensor_lists[0].size();
  int max_chunks_per_tensor = -1;

  for (int t = 0; t < ntensors; t++) {
    int max_chunks_this_tensor =
        (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
    if (max_chunks_this_tensor > max_chunks_per_tensor)
      max_chunks_per_tensor = max_chunks_this_tensor;
  }
  // Although it is a single write then read, it still needs to be zeroed,
  // since the tailing elements also participate in cleanup
  output_per_tensor =
      at::zeros({ntensors * max_chunks_per_tensor}, float_options);
  if (norm_type == 0) {
    DISPATCH_FLOAT_AND_HALF(
        tensor_lists[0][0].scalar_type(), 0, "multi_tensor_maxnorm_cuda",
        multi_tensor_apply<1>(
            BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
            MaxNormFunctor<scalar_t_0>(), output.DATA_PTR<float>(),
            output_per_tensor.DATA_PTR<float>(), true,
            max_chunks_per_tensor);)
  } else {
    DISPATCH_FLOAT_AND_HALF(
        tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
        multi_tensor_apply<1>(
            BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
            L2NormFunctor<scalar_t_0>(), output.DATA_PTR<float>(),
            output_per_tensor.DATA_PTR<float>(), true,
            max_chunks_per_tensor);)
  }
  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launch, but will be negligible end to
  // end. I could get rid of these by hacking the functor + multi tensor
  // harness with persistence logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
// Adding the following device guard since it happens sometimes that the
...
@@ -441,15 +379,9 @@ void multi_tensor_norm_out_cuda(
  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup_v2<<<ntensors, 512, 0, stream>>>(
      output.DATA_PTR<float>(), output_per_tensor.DATA_PTR<float>(),
      ret.DATA_PTR<float>(), out.DATA_PTR<float>(), true,
      max_chunks_per_tensor, norm_type, alpha, beta);

  return;
}
\ No newline at end of file
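For orientation, a minimal host-side sketch of driving multi_tensor_norm_out_cuda. The tensor names, chunk size, and alpha/beta values are assumptions for illustration; out receives one norm per tensor:

  // p0, p1: CUDA float tensors (e.g. model parameters)
  auto noop_flag = at::zeros({1}, at::dtype(at::kInt).device(at::kCUDA));
  auto out = at::zeros({2}, at::dtype(at::kFloat).device(at::kCUDA));
  std::vector<std::vector<at::Tensor>> lists = {{p0, p1}};

  // norm_type 0 selects the L-inf path, any other value the L2 path; with
  // alpha = 0 and beta = 1 the decayed update reduces to the fresh norm:
  //   out[t] = sqrt(0 * out[t]^2 + 1 * n_t^2) = n_t
  multi_tensor_norm_out_cuda(2048 * 32, noop_flag, lists, out,
                             /*alpha=*/0.f, /*beta=*/1.f, /*norm_type=*/2);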