Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
adce1006
"vscode:/vscode.git/clone" did not exist on "cc0b9a7a134ff562c964a584f18dec8044444e0f"
Commit
adce1006
authored
Dec 01, 2023
by
Astha Rai
Browse files
cleaned up errors, randomized input tensor, added more instances
parent
d68df255
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
19 additions
and
22 deletions
+19
-22
example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
...4_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
+6
-5
example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
...4_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
+3
-1
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.cpp
...ance/gpu/permute_scale/device_permute_scale_instances.cpp
+2
-2
profiler/include/profiler/profile_permute_scale_impl.hpp
profiler/include/profiler/profile_permute_scale_impl.hpp
+6
-12
test/permute_scale/test_permute_scale.cpp
test/permute_scale/test_permute_scale.cpp
+2
-2
No files found.
example/44_elementwise_permute/elementwise_permute_4D_fp16_col.cpp
View file @
adce1006
#include <iostream>
#include <cstdlib>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
...
...
@@ -48,10 +49,8 @@ void host_elementwise4D(HostTensorB& B_nhwc,
for
(
std
::
size_t
n
=
0
;
n
<
N
;
++
n
)
{
ADataType
tmp_val
;
// auto a_val = A_nchw(n, c, h, w);
auto
a_val
=
A_nchw
.
mData
[(
n
)
+
(
c
*
N
)
+
(
h
*
C
*
N
)
+
(
w
*
H
*
C
*
N
)];
functor_b
(
tmp_val
,
a_val
);
// functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
functor_a
(
B_nhwc
.
mData
[(
n
)
+
(
c
*
W
*
H
*
N
)
+
(
h
*
N
)
+
(
w
*
H
*
N
)],
scale
*
tmp_val
);
}
...
...
@@ -62,12 +61,14 @@ int main()
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
std
::
vector
<
std
::
size_t
>
nchw
=
{
4
,
2
,
1
,
8
};
std
::
vector
<
std
::
size_t
>
nhwc
=
{
4
,
1
,
8
,
2
};
std
::
vector
<
std
::
size_t
>
nchw
=
{
16
,
8
,
32
,
64
};
std
::
vector
<
std
::
size_t
>
nhwc
=
{
16
,
32
,
64
,
8
};
Tensor
<
ADataType
>
a
(
nchw
);
Tensor
<
BDataType
>
b
(
nhwc
);
float
scale
=
1.
f
;
auto
i
=
0
;
std
::
mt19937
gen
(
11939
);
std
::
uniform_int_distribution
<
int
>
dis
(
0
,
1
);
for
(
std
::
size_t
w
=
0
;
w
<
a
.
mDesc
.
GetLengths
()[
3
];
++
w
)
for
(
std
::
size_t
h
=
0
;
h
<
a
.
mDesc
.
GetLengths
()[
2
];
++
h
)
for
(
std
::
size_t
c
=
0
;
c
<
a
.
mDesc
.
GetLengths
()[
1
];
++
c
)
...
...
@@ -75,7 +76,7 @@ int main()
{
a
.
mData
[(
n
*
nchw
[
1
]
*
nchw
[
2
]
*
nchw
[
3
])
+
(
c
*
nchw
[
2
]
*
nchw
[
3
])
+
(
h
*
nchw
[
3
])
+
w
]
=
i
;
i
++
;
i
=
dis
(
gen
)
;
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a
.
mDesc
.
GetElementSpaceSize
());
...
...
example/44_elementwise_permute/elementwise_permute_4D_fp32_col.cpp
View file @
adce1006
...
...
@@ -67,6 +67,8 @@ int main()
float
scale
=
1.
f
;
auto
i
=
0
;
std
::
mt19937
gen
(
11939
);
std
::
uniform_int_distribution
<
int
>
dis
(
0
,
1
);
for
(
std
::
size_t
w
=
0
;
w
<
a
.
mDesc
.
GetLengths
()[
3
];
++
w
)
for
(
std
::
size_t
h
=
0
;
h
<
a
.
mDesc
.
GetLengths
()[
2
];
++
h
)
for
(
std
::
size_t
c
=
0
;
c
<
a
.
mDesc
.
GetLengths
()[
1
];
++
c
)
...
...
@@ -74,7 +76,7 @@ int main()
{
a
.
mData
[(
n
*
nchw
[
1
]
*
nchw
[
2
]
*
nchw
[
3
])
+
(
c
*
nchw
[
2
]
*
nchw
[
3
])
+
(
h
*
nchw
[
3
])
+
w
]
=
i
;
i
++
;
i
=
dis
(
gen
)
;
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a
.
mDesc
.
GetElementSpaceSize
());
...
...
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.cpp
View file @
adce1006
...
...
@@ -24,14 +24,14 @@ using device_permute_scale_f16_instances =
std
::
tuple
<
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
1
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
8
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
8
,
ck
::
Sequence
<
8
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
8
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
8
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
4
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
2
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
>
;
using
device_permute_scale_f32_instances
=
std
::
tuple
<
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
1
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
8
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
4
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
,
DeviceElementwiseImpl
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
4
,
2
,
ck
::
Sequence
<
1
>
,
ck
::
Sequence
<
1
>>
>
;
// clang-format on
...
...
profiler/include/profiler/profile_permute_scale_impl.hpp
View file @
adce1006
...
...
@@ -4,6 +4,7 @@
#pragma once
#include <iomanip>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...
...
@@ -18,7 +19,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace
ck
{
namespace
profiler
{
...
...
@@ -94,6 +94,8 @@ bool profile_permute_scale_impl(int do_verification,
case
0
:
break
;
case
1
:
a
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
1
,
2
});
break
;
default:
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}
std
::
mt19937
gen
(
11939
);
std
::
uniform_int_distribution
<
int
>
dis
(
0
,
1
);
auto
i
=
0
;
for
(
std
::
size_t
w
=
0
;
w
<
a
.
mDesc
.
GetLengths
()[
3
];
++
w
)
for
(
std
::
size_t
h
=
0
;
h
<
a
.
mDesc
.
GetLengths
()[
2
];
++
h
)
...
...
@@ -102,7 +104,7 @@ bool profile_permute_scale_impl(int do_verification,
{
a
.
mData
[(
n
*
nchw
[
1
]
*
nchw
[
2
]
*
nchw
[
3
])
+
(
c
*
nchw
[
2
]
*
nchw
[
3
])
+
(
h
*
nchw
[
3
])
+
w
]
=
i
;
i
++
;
i
=
dis
(
gen
)
;
}
}
...
...
@@ -136,8 +138,6 @@ bool profile_permute_scale_impl(int do_verification,
host_elementwise4D
(
host_b
,
a
,
ElementOp
{},
UnaryOp
{},
scale
);
}
int
num_kernel
=
0
;
for
(
auto
&
op_ptr
:
op_ptrs
)
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
ab_lengths
,
...
...
@@ -207,14 +207,8 @@ bool profile_permute_scale_impl(int do_verification,
if
(
time_kernel
)
{
LogRange
(
std
::
cout
<<
"length = "
,
lengths
,
","
)
<<
", "
;
std
::
cout
<<
"num_kernel = "
<<
num_kernel
<<
", best perf = "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
}
if
(
num_kernel
==
1
)
{
std
::
cout
<<
"Error: No kernel is tested"
<<
std
::
endl
;
return
false
;
std
::
cout
<<
"best perf = "
<<
best_ave_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
}
return
true
;
...
...
test/permute_scale/test_permute_scale.cpp
View file @
adce1006
...
...
@@ -18,12 +18,12 @@ class TestPermute : public ::testing::Test
void
Run
()
{
std
::
vector
<
std
::
vector
<
ck
::
index_t
>>
lengths
=
{
{
4
,
2
,
1
,
8
},
{
1
,
1
,
1
,
1
},
{
16
,
8
,
32
,
8
}};
{
4
,
2
,
1
,
8
},
{
1
,
1
,
1
,
1
},
{
16
,
8
,
32
,
64
},
{
32
,
64
,
128
,
12
8
}};
for
(
auto
length
:
lengths
)
{
bool
success
=
ck
::
profiler
::
profile_permute_scale_impl
<
ADataType
,
BDataType
,
4
>
(
true
,
2
,
false
,
fals
e
,
length
);
true
,
2
,
false
,
tru
e
,
length
);
EXPECT_TRUE
(
success
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment