Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
b67eebd2
Commit
b67eebd2
authored
May 16, 2022
by
Jie Zhu
Committed by
binmakeswell
May 17, 2022
Browse files
[NFC] polish colossalai/kernel/cuda_native/csrc/multi_tensor_scale_kernel.cu code style (#977)
parent
52705ec5
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
7 additions
and
7 deletions
+7
-7
colossalai/kernel/cuda_native/csrc/multi_tensor_scale_kernel.cu
...alai/kernel/cuda_native/csrc/multi_tensor_scale_kernel.cu
+7
-7
No files found.
colossalai/kernel/cuda_native/csrc/multi_tensor_scale_kernel.cu
View file @
b67eebd2
...
...
@@ -15,7 +15,8 @@
#define BLOCK_SIZE 512
#define ILP 4

// Alignment predicate for vectorized memory access: a pointer qualifies
// when its address is an exact multiple of ILP * sizeof(T), so that an
// ILP-element group of T can be moved with one wide load/store.
// NOTE(review): reconstructed from a diff view that rendered the
// signature twice (old/new sides); this is a single definition.
template <typename T>
__device__ __forceinline__ bool is_aligned(T *p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}
...
...
@@ -27,7 +28,8 @@ __device__ __forceinline__ void load_store(T *dst, T *src, int dst_offset,
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
template
<
typename
in_t
,
typename
out_t
>
struct
ScaleFunctor
{
template
<
typename
in_t
,
typename
out_t
>
struct
ScaleFunctor
{
__device__
__forceinline__
void
operator
()(
int
chunk_size
,
volatile
int
*
noop_gmem
,
TensorListMetadata
<
2
>
&
tl
,
...
...
@@ -76,8 +78,7 @@ template <typename in_t, typename out_t> struct ScaleFunctor {
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
r_in
[
ii
]
=
0
;
int
i
=
i_start
+
threadIdx
.
x
+
ii
*
blockDim
.
x
;
if
(
i
<
n
&&
i
<
chunk_size
)
r_in
[
ii
]
=
in
[
i
];
if
(
i
<
n
&&
i
<
chunk_size
)
r_in
[
ii
]
=
in
[
i
];
}
// note for clarification to future michael:
// From a pure memory dependency perspective, there's likely no point
...
...
@@ -93,8 +94,7 @@ template <typename in_t, typename out_t> struct ScaleFunctor {
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
i
=
i_start
+
threadIdx
.
x
+
ii
*
blockDim
.
x
;
if
(
i
<
n
&&
i
<
chunk_size
)
out
[
i
]
=
r_out
[
ii
];
if
(
i
<
n
&&
i
<
chunk_size
)
out
[
i
]
=
r_out
[
ii
];
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment