Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9608beee
Unverified
Commit
9608beee
authored
Nov 06, 2022
by
arai713
Committed by
GitHub
Nov 06, 2022
Browse files
Merge branch 'develop' into gridwise_2d
parents
d179a12a
8a4253ba
Changes
172
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
632 additions
and
176 deletions
+632
-176
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
+27
-0
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
...u/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
+27
-0
profiler/include/profile_elementwise_layernorm_impl.hpp
profiler/include/profile_elementwise_layernorm_impl.hpp
+264
-0
profiler/include/profile_groupnorm_impl.hpp
profiler/include/profile_groupnorm_impl.hpp
+3
-1
profiler/include/profile_layernorm_impl.hpp
profiler/include/profile_layernorm_impl.hpp
+38
-15
profiler/include/profile_softmax_impl.hpp
profiler/include/profile_softmax_impl.hpp
+79
-123
profiler/src/profile_softmax.cpp
profiler/src/profile_softmax.cpp
+18
-25
profiler/src/profiler.cpp
profiler/src/profiler.cpp
+4
-3
script/cmake-ck-dev.sh
script/cmake-ck-dev.sh
+1
-1
script/cmake-ck-release.sh
script/cmake-ck-release.sh
+1
-1
test/CMakeLists.txt
test/CMakeLists.txt
+2
-1
test/elementwise_normalization/CMakeLists.txt
test/elementwise_normalization/CMakeLists.txt
+7
-0
test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
...entwise_normalization/test_elementwise_layernorm_fp16.cpp
+47
-0
test/normalization/CMakeLists.txt
test/normalization/CMakeLists.txt
+4
-4
test/normalization/test_groupnorm_fp16.cpp
test/normalization/test_groupnorm_fp16.cpp
+1
-1
test/normalization/test_groupnorm_fp32.cpp
test/normalization/test_groupnorm_fp32.cpp
+1
-1
No files found.
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Tensor rank shared by every softmax instance registered from this translation unit.
static constexpr index_t RANK = 3;

/// Appends the I8-input / F32-accumulate / I8-output softmax device instances for
/// rank-3 tensors to `instances`.
///
/// The instance set is `device_softmax_i8_i8_instances<RANK, 2>`; the second template
/// argument is presumably the number of reduced dimensions (matches the "reduce2"
/// suffix of this function) -- confirm against the instance-type header.
///
/// @param instances  Container that receives the newly created instances.
void add_device_softmax_i8_i8_rank3_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceSet = device_softmax_i8_i8_instances<RANK, 2>;

    add_device_operation_instances(instances, InstanceSet{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Tensor rank shared by every softmax instance registered from this translation unit.
static constexpr index_t RANK = 3;

/// Appends the I8-input / F32-accumulate / I8-output softmax device instances for
/// rank-3 tensors to `instances`.
///
/// The instance set is `device_softmax_i8_i8_instances<RANK, 3>`; the second template
/// argument is presumably the number of reduced dimensions (matches the "reduce3"
/// suffix of this function) -- confirm against the instance-type header.
///
/// @param instances  Container that receives the newly created instances.
void add_device_softmax_i8_i8_rank3_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceSet = device_softmax_i8_i8_instances<RANK, 3>;

    add_device_operation_instances(instances, InstanceSet{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Tensor rank shared by every softmax instance registered from this translation unit.
static constexpr index_t RANK = 4;

/// Appends the I8-input / F32-accumulate / I8-output softmax device instances for
/// rank-4 tensors to `instances`.
///
/// The instance set is `device_softmax_i8_i8_instances<RANK, 1>`; the second template
/// argument is presumably the number of reduced dimensions (matches the "reduce1"
/// suffix of this function) -- confirm against the instance-type header.
///
/// @param instances  Container that receives the newly created instances.
void add_device_softmax_i8_i8_rank4_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceSet = device_softmax_i8_i8_instances<RANK, 1>;

    add_device_operation_instances(instances, InstanceSet{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Tensor rank shared by every softmax instance registered from this translation unit.
static constexpr index_t RANK = 4;

/// Appends the I8-input / F32-accumulate / I8-output softmax device instances for
/// rank-4 tensors to `instances`.
///
/// The instance set is `device_softmax_i8_i8_instances<RANK, 2>`; the second template
/// argument is presumably the number of reduced dimensions (matches the "reduce2"
/// suffix of this function) -- confirm against the instance-type header.
///
/// @param instances  Container that receives the newly created instances.
void add_device_softmax_i8_i8_rank4_reduce2_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceSet = device_softmax_i8_i8_instances<RANK, 2>;

    add_device_operation_instances(instances, InstanceSet{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Tensor rank shared by every softmax instance registered from this translation unit.
static constexpr index_t RANK = 4;

/// Appends the I8-input / F32-accumulate / I8-output softmax device instances for
/// rank-4 tensors to `instances`.
///
/// The instance set is `device_softmax_i8_i8_instances<RANK, 3>`; the second template
/// argument is presumably the number of reduced dimensions (matches the "reduce3"
/// suffix of this function) -- confirm against the instance-type header.
///
/// @param instances  Container that receives the newly created instances.
void add_device_softmax_i8_i8_rank4_reduce3_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceSet = device_softmax_i8_i8_instances<RANK, 3>;

    add_device_operation_instances(instances, InstanceSet{});
}

} // namespace ck::tensor_operation::device::instance
library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck::tensor_operation::device::instance {

// Tensor rank shared by every softmax instance registered from this translation unit.
static constexpr index_t RANK = 4;

/// Appends the I8-input / F32-accumulate / I8-output softmax device instances for
/// rank-4 tensors to `instances`.
///
/// The instance set is `device_softmax_i8_i8_instances<RANK, 4>`; the second template
/// argument is presumably the number of reduced dimensions (matches the "reduce4"
/// suffix of this function) -- confirm against the instance-type header.
///
/// @param instances  Container that receives the newly created instances.
void add_device_softmax_i8_i8_rank4_reduce4_instances(
    std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
    using InstanceSet = device_softmax_i8_i8_instances<RANK, 4>;

    add_device_operation_instances(instances, InstanceSet{});
}

} // namespace ck::tensor_operation::device::instance
profiler/include/profile_elementwise_layernorm_impl.hpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace
ck
{
namespace
profiler
{
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
void
host_elementwise2D
(
HostTensorC
&
C
,
const
HostTensorA
&
A
,
const
HostTensorB
&
B
,
const
std
::
vector
<
std
::
size_t
>&
shape
,
Functor
functor
)
{
using
ctype
=
ck
::
remove_reference_t
<
decltype
(
C
(
0
,
0
))
>
;
for
(
std
::
size_t
m
=
0
;
m
<
shape
[
0
];
++
m
)
for
(
std
::
size_t
n
=
0
;
n
<
shape
[
1
];
++
n
)
{
auto
a_val
=
A
(
m
,
n
);
auto
b_val
=
B
(
m
,
n
);
ctype
c_val
=
0
;
functor
(
c_val
,
a_val
,
b_val
);
C
(
m
,
n
)
=
c_val
;
}
}
/// Profiles, and optionally verifies, every registered device instance of the fused
/// "elementwise Add followed by normalization" operation on a 2D (M x N) problem.
///
/// Steps performed:
///   1. Host tensors a, b (M x N) and gamma, beta (length N) are created and filled
///      according to `init_method`, then copied to device buffers.
///   2. If `do_verification` is non-zero, a host reference is computed once:
///      x = Add(a, b) followed by ReferenceLayernorm over dimension 1.
///   3. Each instance returned by DeviceOperationInstanceFactory that reports the argument
///      as supported is run; its result is compared with the reference when requested.
///
/// @tparam ADataType      Element type of input a (also used for the host-side sum x).
/// @tparam BDataType      Element type of input b.
/// @tparam GammaDataType  Element type of gamma.
/// @tparam BetaDataType   Element type of beta.
/// @tparam AccDataType    Accumulation type forwarded to the device op and the reference.
/// @tparam YDataType      Element type of the output.
///
/// @param do_verification  Non-zero enables comparison against the host reference.
/// @param init_method      0 -> GeneratorTensor_1 for all tensors; 1 -> GeneratorTensor_2{-5, 5};
///                         any other value -> GeneratorTensor_3 ({0, 1} for a/b,
///                         {-0.5, 0.5} for gamma/beta).
/// @param do_log           When verifying, also print a, b, host_y and y.
/// @param time_kernel      Forwarded to StreamConfig; also enables the performance printouts.
/// @param length           Problem lengths; must contain exactly two entries {M, N}.
///
/// @return false if `length` does not have exactly two entries, if a supported instance fails
///         verification, or if no instance supports the argument; true otherwise.
template <typename ADataType,
          typename BDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename AccDataType,
          typename YDataType>
bool profile_elementwise_layernorm_impl(int do_verification,
                                        int init_method,
                                        bool do_log,
                                        bool time_kernel,
                                        std::vector<index_t> length)
{
    using Add         = ck::tensor_operation::element_wise::Add;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // Only 2D problems are handled here.
    if(length.size() != 2)
        return false;

    index_t M      = length[0];
    index_t N      = length[1];
    index_t Stride = N;

    constexpr int Rank         = 2;
    constexpr int NumReduceDim = 1;

    std::vector<index_t> reduce_dim      = {1};
    std::vector<index_t> gammaBetaLength = {N};
    // Stride 0 along dimension 0: the same gamma/beta vector is reused for every row.
    std::vector<index_t> gammaBetaStride = {0, 1};

    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                    std::vector<std::size_t>({stride, 1}));
    };

    Tensor<ADataType> a(length);
    Tensor<BDataType> b(length);
    Tensor<GammaDataType> gamma(gammaBetaLength);
    Tensor<BetaDataType> beta(gammaBetaLength);
    Tensor<YDataType> y(length);
    Tensor<YDataType> host_y(length);

    switch(init_method)
    {
    case 0:
        a.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
        b.GenerateTensorValue(GeneratorTensor_1<BDataType>{});
        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
        beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
        break;
    case 1:
        a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
        beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
        break;
    default:
        a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1});
        b.GenerateTensorValue(GeneratorTensor_3<BDataType>{0, 1});
        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
    }

    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    // Sized with BDataType (the element type of b); sizing it with ADataType would allocate
    // the wrong number of bytes whenever the two input types differ in size.
    DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());

    a_dev.ToDevice(a.mData.data());
    b_dev.ToDevice(b.mData.data());
    gamma_dev.ToDevice(gamma.mData.data());
    beta_dev.ToDevice(beta.mData.data());

    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};

    // add device normalization instances
    using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
        ck::Tuple<ADataType, BDataType>,
        GammaDataType,
        BetaDataType,
        AccDataType,
        YDataType,
        Add,
        PassThrough,
        Rank,
        NumReduceDim>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        // Host reference: x = a + b, then layernorm of x over dimension 1.
        using XDataType = ADataType;

        std::vector<std::size_t> mn = {static_cast<std::size_t>(M), static_cast<std::size_t>(N)};
        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
        host_elementwise2D<Tensor<ADataType>, Tensor<BDataType>, Tensor<XDataType>, Add>(
            x, a, b, mn, Add{});

        using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType,
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
                                                                                 AccDataType,
                                                                                 PassThrough,
                                                                                 Rank,
                                                                                 NumReduceDim>;

        ReferenceInstance ref;
        // The trailing 1e-4 matches the value passed to the device op below
        // (presumably the normalization epsilon -- confirm against ReferenceLayernorm).
        auto ref_argument =
            ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4);
        auto ref_invoker = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);
    }

    // Number of instances that accepted the argument and were actually run.
    int num_kernel = 0;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            length,
            {
                std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
                std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
            },
            gammaBetaStride,
            gammaBetaStride,
            std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
            reduce_dim,
            1e-4,
            input,
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            y_dev.GetDeviceBuffer(),
            Add{},
            PassThrough{});

        // Instances that cannot handle this problem are silently skipped.
        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            continue;
        }
        ++num_kernel;

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // Bytes of all input tensors plus the output tensor.
        std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) +
                                b.mDesc.GetElementSize() * sizeof(BDataType) +
                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
                                y.mDesc.GetElementSize() * sizeof(YDataType);

        // bytes / 1e6 / milliseconds == GB/s (avg_time is printed as "ms" below).
        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
                      << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            y_dev.FromDevice(y.mData.data());

            bool pass =
                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(std::cout << "a  : ", a.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "b  : ", b.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "host_y  : ", host_y.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "y  : ", y.mData, ",") << std::endl;
            }

            if(!pass)
            {
                // The first failing instance aborts the whole profile run.
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
                return false;
            }
            else
            {
                if(time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", length, ",") << ", ";
        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is tested" << std::endl;
        return false;
    }

    return true;
}
}
// namespace profiler
}
// namespace ck
profiler/include/profile_groupnorm_impl.hpp
View file @
9608beee
...
...
@@ -126,6 +126,8 @@ bool profile_groupnorm_impl(int do_verification,
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
nullptr
,
nullptr
,
PassThrough
{});
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
...
...
@@ -196,7 +198,7 @@ bool profile_groupnorm_impl(int do_verification,
if
(
num_kernel
==
0
)
{
std
::
cout
<<
"Error: No kernel is
tested
"
<<
std
::
endl
;
std
::
cout
<<
"Error: No kernel is
applicable
"
<<
std
::
endl
;
return
false
;
}
...
...
profiler/include/profile_layernorm_impl.hpp
View file @
9608beee
...
...
@@ -22,7 +22,7 @@ template <typename XDataType,
typename
AccDataType
,
typename
YDataType
,
index_t
Rank
>
void
profile_layernorm_impl
(
int
do_verification
,
bool
profile_layernorm_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
...
...
@@ -31,7 +31,7 @@ void profile_layernorm_impl(int do_verification,
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
if
(
length
.
size
()
<
2
)
return
;
return
false
;
// Assume normalize dimension except for batch (first) dimension
std
::
vector
<
index_t
>
reduce_length
{
length
.
begin
()
+
1
,
length
.
end
()};
...
...
@@ -52,7 +52,6 @@ void profile_layernorm_impl(int do_verification,
switch
(
init_method
)
{
// case 0: break;
case
0
:
x
.
GenerateTensorValue
(
GeneratorTensor_1
<
XDataType
>
{});
gamma
.
GenerateTensorValue
(
GeneratorTensor_1
<
GammaDataType
>
{});
...
...
@@ -122,6 +121,8 @@ void profile_layernorm_impl(int do_verification,
ref_invoker
.
Run
(
ref_argument
);
}
int
num_kernel
=
0
;
for
(
auto
&
inst_ptr
:
instance_ptrs
)
{
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
length
,
...
...
@@ -135,12 +136,21 @@ void profile_layernorm_impl(int do_verification,
gamma_dev
.
GetDeviceBuffer
(),
beta_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
nullptr
,
nullptr
,
PassThrough
{});
if
(
!
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
inst_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
++
num_kernel
;
}
else
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
;
LogRange
(
std
::
cout
<<
"input lengths = "
,
length
,
", "
)
<<
std
::
endl
;
if
(
time_kernel
)
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
;
LogRange
(
std
::
cout
<<
"input lengths = "
,
length
,
", "
)
<<
std
::
endl
;
}
continue
;
}
...
...
@@ -156,8 +166,9 @@ void profile_layernorm_impl(int do_verification,
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
inst_ptr
->
GetTypeString
()
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
inst_ptr
->
GetTypeString
()
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
...
...
@@ -184,20 +195,32 @@ void profile_layernorm_impl(int do_verification,
{
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" failed verification: "
;
LogRange
(
std
::
cout
<<
"lengths = ["
,
length
,
", "
)
<<
"]."
<<
std
::
endl
;
return
;
return
false
;
}
else
{
std
::
cout
<<
"pass"
<<
std
::
endl
;
if
(
time_kernel
)
std
::
cout
<<
"pass"
<<
std
::
endl
;
}
}
}
LogRange
(
std
::
cout
<<
"length = "
,
length
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"stride = "
,
strideXY
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"reduce dims "
,
reduce_dim
,
","
)
<<
std
::
endl
;
std
::
cout
<<
"best perf = "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
if
(
time_kernel
)
{
LogRange
(
std
::
cout
<<
"length = "
,
length
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"stride = "
,
strideXY
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"reduce dims "
,
reduce_dim
,
","
)
<<
std
::
endl
;
std
::
cout
<<
"best perf = "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
}
if
(
num_kernel
==
0
)
{
std
::
cout
<<
"Error: No kernel is applicable"
<<
std
::
endl
;
return
false
;
}
return
true
;
}
}
// namespace profiler
...
...
profiler/include/profile_softmax_impl.hpp
View file @
9608beee
...
...
@@ -3,55 +3,27 @@
#pragma once
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
namespace
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
}
// namespace
void
add_device_softmax_f16_f16_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
3
>>&
);
void
add_device_softmax_f16_f16_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F16
,
F32
,
F16
,
PassThrough
,
PassThrough
,
4
>>&
);
void
add_device_softmax_f32_f32_rank3_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
3
>>&
);
void
add_device_softmax_f32_f32_rank4_instances
(
std
::
vector
<
DeviceSoftmaxPtr
<
F32
,
F32
,
F32
,
PassThrough
,
PassThrough
,
4
>>&
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
ck
{
namespace
profiler
{
enum
struct
NormType
{
BATCHNORM
,
SOFTMAX
,
};
enum
struct
NormDataType
enum
struct
SoftmaxDataType
{
F32_F32
,
// in, out
F16_F16
,
...
...
@@ -60,7 +32,7 @@ enum struct NormDataType
};
// clang-format off
template
<
typename
Norm
DataType
>
std
::
string
type_to_string
();
template
<
typename
Softmax
DataType
>
std
::
string
type_to_string
();
template
<
>
std
::
string
type_to_string
<
float
>
()
{
return
"f32"
;
}
template
<
>
std
::
string
type_to_string
<
half_t
>
()
{
return
"f16"
;
}
template
<
>
std
::
string
type_to_string
<
bhalf_t
>
()
{
return
"bf16"
;
}
...
...
@@ -69,7 +41,7 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
// clang-format on
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
>
void
profile_softmax_impl
(
int
do_verification
,
bool
profile_softmax_impl
(
int
do_verification
,
int
init_method
,
bool
do_log
,
bool
time_kernel
,
...
...
@@ -77,8 +49,7 @@ void profile_softmax_impl(int do_verification,
std
::
vector
<
index_t
>
in_strides
,
std
::
vector
<
index_t
>
reduce_dims
,
AccDataType
alpha
,
AccDataType
beta
,
NormType
norm_type
)
AccDataType
beta
)
{
if
(
Rank
!=
in_length
.
size
())
{
...
...
@@ -88,62 +59,46 @@ void profile_softmax_impl(int do_verification,
Tensor
<
InDataType
>
in
=
in_strides
.
empty
()
?
Tensor
<
InDataType
>
(
in_length
)
:
Tensor
<
InDataType
>
(
in_length
,
in_strides
);
Tensor
<
OutDataType
>
out
(
in
.
mDesc
);
Tensor
<
OutDataType
>
prior_out
(
in
.
mDesc
);
switch
(
init_method
)
{
// case 0: break;
case
0
:
in
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{});
out
.
GenerateTensorValue
(
GeneratorTensor_1
<
OutDataType
>
{});
break
;
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
out
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
ck
::
utils
::
FillUniformDistributionIntegerValue
<
InDataType
>
{
-
5.
f
,
5.
f
}(
in
.
begin
(),
in
.
end
());
ck
::
utils
::
FillUniformDistributionIntegerValue
<
OutDataType
>
{
-
5.
f
,
5.
f
}(
prior_out
.
begin
(),
prior_out
.
end
());
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
}
);
out
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
0.5
,
0.5
}
);
ck
::
utils
::
FillUniformDistribution
<
InDataType
>
{
0.0
f
,
1.0
f
}(
in
);
ck
::
utils
::
FillUniformDistribution
<
OutDataType
>
{
-
0.5
f
,
0.5
f
}(
prior_out
);
}
Tensor
<
OutDataType
>
out_ref
(
out
);
Tensor
<
OutDataType
>
out_ref
(
prior_out
);
if
(
do_verification
)
{
using
ReferenceSoftmax
=
tensor_operation
::
host
::
ReferenceSoftmax
<
InDataType
,
OutDataType
,
AccDataType
>
;
ReferenceSoftmax
{}.
MakeInvoker
().
Run
({
in
,
out_ref
,
alpha
,
beta
,
reduce_dims
});
}
DeviceMem
in_dev
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_dev
(
sizeof
(
OutDataType
)
*
out
.
mDesc
.
GetElementSpaceSize
());
in_dev
.
ToDevice
(
in
.
mData
.
data
());
out_dev
.
ToDevice
(
out
.
mData
.
data
());
DeviceMem
in_dev
(
in
.
GetElementSpaceSizeInBytes
());
DeviceMem
out_dev
(
out
.
GetElementSpaceSizeInBytes
());
in_dev
.
ToDevice
(
in
.
data
());
std
::
vector
<
index_t
>
i
_in
_lengths
(
in
.
mDesc
.
GetLengths
().
begin
(),
in
.
mDesc
.
GetLengths
().
end
());
std
::
vector
<
index_t
>
i
_in
_strides
(
in
.
mDesc
.
GetStrides
().
begin
(),
in
.
mDesc
.
GetStrides
().
end
());
std
::
vector
<
index_t
>
i
n_tensor
_lengths
(
in
.
GetLengths
().
begin
(),
in
.
GetLengths
().
end
());
std
::
vector
<
index_t
>
i
n_tensor
_strides
(
in
.
GetStrides
().
begin
(),
in
.
GetStrides
().
end
());
// add device softmax instances
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceOpPtr
=
tensor_operation
::
device
::
DeviceSoftmaxPtr
<
InDataType
,
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
>
;
std
::
vector
<
DeviceOpPtr
>
instances
;
using
DeviceOp
=
tensor_operation
::
device
::
DeviceSoftmax
<
InDataType
,
AccDataType
,
OutDataType
,
PassThrough
,
PassThrough
,
Rank
>
;
if
(
norm_type
==
NormType
::
SOFTMAX
)
{
if
constexpr
(
is_same
<
InDataType
,
half_t
>::
value
&&
is_same
<
OutDataType
,
half_t
>::
value
&&
is_same
<
AccDataType
,
float
>::
value
)
{
if
constexpr
(
Rank
==
3
)
tensor_operation
::
device
::
instance
::
add_device_softmax_f16_f16_rank3_instances
(
instances
);
else
if
constexpr
(
Rank
==
4
)
tensor_operation
::
device
::
instance
::
add_device_softmax_f16_f16_rank4_instances
(
instances
);
}
else
if
constexpr
(
is_same
<
InDataType
,
float
>::
value
&&
is_same
<
OutDataType
,
float
>::
value
&&
is_same
<
AccDataType
,
float
>::
value
)
{
if
constexpr
(
Rank
==
3
)
tensor_operation
::
device
::
instance
::
add_device_softmax_f32_f32_rank3_instances
(
instances
);
else
if
constexpr
(
Rank
==
4
)
tensor_operation
::
device
::
instance
::
add_device_softmax_f32_f32_rank4_instances
(
instances
);
}
}
// get device op instances
const
auto
instances
=
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
instances
.
size
()
<<
" instances"
<<
std
::
endl
;
if
(
instances
.
size
()
<=
0
)
{
...
...
@@ -153,21 +108,19 @@ void profile_softmax_impl(int do_verification,
std
::
string
best_instance_name
;
float
best_avg_time
=
std
::
numeric_limits
<
float
>::
max
();
float
best_gb_per_sec
=
0
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
std
::
vector
<
bool
>
instance_pass
;
for
(
auto
&
inst_ptr
:
instances
)
{
// Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
// problem to rank 4 kernel) other than invoking IsSupportedArgument()?
if
(
!
(
inst_ptr
->
GetRank
()
==
static_cast
<
index_t
>
(
i_in_lengths
.
size
())
&&
inst_ptr
->
GetNumReduceDim
()
==
static_cast
<
index_t
>
(
reduce_dims
.
size
())))
if
(
!
(
inst_ptr
->
GetNumReduceDim
()
==
static_cast
<
index_t
>
(
reduce_dims
.
size
())))
{
continue
;
}
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
i
_in
_lengths
,
i
_in
_strides
,
auto
argument_ptr
=
inst_ptr
->
MakeArgumentPointer
(
i
n_tensor
_lengths
,
i
n_tensor
_strides
,
reduce_dims
,
&
alpha
,
&
beta
,
...
...
@@ -181,45 +134,42 @@ void profile_softmax_impl(int do_verification,
std
::
cout
<<
inst_ptr
->
GetTypeString
()
<<
" skipped due to unsupported argument: "
;
LogRange
(
std
::
cout
<<
"input lengths = ["
,
in_length
,
", "
)
<<
"], "
<<
"scaler = ["
<<
alpha
<<
", "
<<
beta
<<
"]."
<<
std
::
endl
;
return
;
<<
"scaler = ["
<<
alpha
<<
", "
<<
beta
<<
"]"
;
LogRange
(
std
::
cout
<<
", reduce dims = ["
,
reduce_dims
,
", "
)
<<
"]."
<<
std
::
endl
;
instance_pass
.
push_back
(
true
);
continue
;
}
out_dev
.
ToDevice
(
prior_out
.
data
());
auto
invoker_ptr
=
inst_ptr
->
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
num_bytes
=
in
.
mDesc
.
GetElementSize
()
*
sizeof
(
InDataType
)
+
(
beta
==
0.0
f
?
1
:
2
)
*
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
OutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
if
(
time_kernel
)
{
std
::
size_t
num_bytes
=
in
.
GetElementSize
()
*
sizeof
(
InDataType
)
+
(
beta
==
0.0
f
?
1
:
2
)
*
out
.
GetElementSize
()
*
sizeof
(
OutDataType
);
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
inst_ptr
->
GetTypeString
()
<<
std
::
endl
;
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
inst_ptr
->
GetTypeString
()
<<
std
::
endl
;
if
(
avg_time
<
best_avg_time
)
{
best_instance_name
=
inst_ptr
->
GetTypeString
();
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
if
(
avg_time
<
best_avg_time
)
{
best_instance_name
=
inst_ptr
->
GetTypeString
();
best_avg_time
=
avg_time
;
best_gb_per_sec
=
gb_per_sec
;
}
}
if
(
do_verification
)
{
// TODO: factory method to dynamically switch between different reference normalizations
using
ReferenceFactory
=
tensor_operation
::
host
::
ReferenceSoftmax
<
InDataType
,
OutDataType
,
AccDataType
>
;
ReferenceFactory
{}.
MakeInvoker
().
Run
({
in
,
out_ref
,
alpha
,
beta
,
reduce_dims
});
out_dev
.
FromDevice
(
out
.
mData
.
data
());
bool
pass
;
out_dev
.
FromDevice
(
out
.
data
());
bool
pass
=
true
;
if
(
std
::
is_same
<
InDataType
,
int8_t
>::
value
)
{
pass
=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
,
"Error: Incorrect results!"
,
0
,
1
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
,
"Error: Incorrect results!"
,
0
,
1
);
if
(
do_log
)
{
LogRangeAsType
<
int
>
(
std
::
cout
<<
"in : "
,
in
.
mData
,
","
)
<<
std
::
endl
;
...
...
@@ -230,7 +180,7 @@ void profile_softmax_impl(int do_verification,
}
else
{
pass
=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
do_log
)
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"in : "
,
in
.
mData
,
","
)
<<
std
::
endl
;
...
...
@@ -247,16 +197,22 @@ void profile_softmax_impl(int do_verification,
<<
"], "
<<
"scaler = ["
<<
alpha
<<
", "
<<
beta
<<
"]."
<<
std
::
endl
;
}
instance_pass
.
push_back
(
pass
);
}
}
std
::
cout
<<
"Best Perf for datatype = "
<<
type_to_string
<
InDataType
>
()
<<
"_"
<<
type_to_string
<
OutDataType
>
()
<<
", "
;
LogRange
(
std
::
cout
<<
"length = "
,
i_in_lengths
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"stride = "
,
i_in_strides
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"reduce dims "
,
reduce_dims
,
","
)
<<
", "
;
std
::
cout
<<
"alpha = "
<<
alpha
<<
", "
<<
"beta = "
<<
beta
<<
", "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
if
(
time_kernel
)
{
std
::
cout
<<
"Best Perf for datatype = "
<<
type_to_string
<
InDataType
>
()
<<
"_"
<<
type_to_string
<
OutDataType
>
()
<<
", "
;
LogRange
(
std
::
cout
<<
"length = "
,
in_tensor_lengths
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"stride = "
,
in_tensor_strides
,
","
)
<<
", "
;
LogRange
(
std
::
cout
<<
"reduce dims "
,
reduce_dims
,
","
)
<<
", "
;
std
::
cout
<<
"alpha = "
<<
alpha
<<
", "
<<
"beta = "
<<
beta
<<
", "
<<
best_avg_time
<<
" ms, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_instance_name
<<
std
::
endl
;
}
return
std
::
all_of
(
std
::
begin
(
instance_pass
),
std
::
end
(
instance_pass
),
[](
bool
p
)
{
return
p
;
});
}
}
// namespace profiler
...
...
profiler/src/profile_softmax.cpp
View file @
9608beee
...
...
@@ -8,14 +8,10 @@
#include "profiler/include/profile_softmax_impl.hpp"
using
ck
::
index_t
;
using
ck
::
profiler
::
NormDataType
;
using
ck
::
profiler
::
NormType
;
using
ck
::
profiler
::
SoftmaxDataType
;
struct
ArgParser
{
std
::
unordered_map
<
std
::
string
,
NormType
>
norm_dict
=
{{
"batchnorm"
,
NormType
::
BATCHNORM
},
{
"softmax"
,
NormType
::
SOFTMAX
}};
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int
>>
long_opts
=
{
{
"length"
,
{}},
{
"stride"
,
{}},
{
"reduce"
,
{}},
{
"alpha"
,
{}},
{
"beta"
,
{}}};
...
...
@@ -50,7 +46,7 @@ struct ArgParser
void
print_help
()
{
std
::
cout
<<
"arg1: tensor operation (
batchnorm/
softmax)
\n
"
std
::
cout
<<
"arg1: tensor operation (softmax)
\n
"
<<
"arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)
\n
"
<<
"arg3: verification (0: no; 1: yes)
\n
"
<<
"arg4: initialization (0: no init; 1: integer value; 2: decimal value)
\n
"
...
...
@@ -64,7 +60,7 @@ void print_help()
<<
std
::
endl
;
}
int
profile_
normalization
(
int
argc
,
char
*
argv
[])
int
profile_
softmax
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<=
2
)
{
...
...
@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[])
ArgParser
arg_parser
;
// short unnamed options
const
NormType
norm_type
=
arg_parser
.
norm_dict
[
argv
[
1
]];
const
NormDataType
data_type
=
static_cast
<
NormDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
3
]);
const
int
init_method
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
5
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
6
]);
const
SoftmaxDataType
data_type
=
static_cast
<
SoftmaxDataType
>
(
std
::
stoi
(
argv
[
2
]));
const
bool
do_verification
=
std
::
stoi
(
argv
[
3
]);
const
int
init_method
=
std
::
stoi
(
argv
[
4
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
5
]);
const
bool
time_kernel
=
std
::
stoi
(
argv
[
6
]);
// parse the long options
arg_parser
(
argc
,
argv
);
...
...
@@ -91,9 +86,10 @@ int profile_normalization(int argc, char* argv[])
arg_parser
.
long_opts
[
"alpha"
].
empty
()
?
1
:
arg_parser
.
long_opts
[
"alpha"
][
0
];
const
index_t
beta
=
arg_parser
.
long_opts
[
"beta"
].
empty
()
?
0
:
arg_parser
.
long_opts
[
"beta"
][
0
];
// Rank 3
if
(
length
.
size
()
==
3
)
{
if
(
data_type
==
Norm
DataType
::
F16_F16
)
if
(
data_type
==
Softmax
DataType
::
F16_F16
)
{
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
3
>
(
do_verification
,
init_method
,
...
...
@@ -103,10 +99,9 @@ int profile_normalization(int argc, char* argv[])
stride
,
reduce
,
float
(
alpha
),
float
(
beta
),
norm_type
);
float
(
beta
));
}
else
if
(
data_type
==
Norm
DataType
::
F32_F32
)
else
if
(
data_type
==
Softmax
DataType
::
F32_F32
)
{
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
3
>
(
do_verification
,
init_method
,
...
...
@@ -116,17 +111,17 @@ int profile_normalization(int argc, char* argv[])
stride
,
reduce
,
float
(
alpha
),
float
(
beta
),
norm_type
);
float
(
beta
));
}
else
{
throw
std
::
runtime_error
(
"not implemented yet"
);
}
}
// Rank 4
else
if
(
length
.
size
()
==
4
)
{
if
(
data_type
==
Norm
DataType
::
F16_F16
)
if
(
data_type
==
Softmax
DataType
::
F16_F16
)
{
ck
::
profiler
::
profile_softmax_impl
<
ck
::
half_t
,
float
,
ck
::
half_t
,
4
>
(
do_verification
,
init_method
,
...
...
@@ -136,10 +131,9 @@ int profile_normalization(int argc, char* argv[])
stride
,
reduce
,
float
(
alpha
),
float
(
beta
),
norm_type
);
float
(
beta
));
}
else
if
(
data_type
==
Norm
DataType
::
F32_F32
)
else
if
(
data_type
==
Softmax
DataType
::
F32_F32
)
{
ck
::
profiler
::
profile_softmax_impl
<
float
,
float
,
float
,
4
>
(
do_verification
,
init_method
,
...
...
@@ -149,8 +143,7 @@ int profile_normalization(int argc, char* argv[])
stride
,
reduce
,
float
(
alpha
),
float
(
beta
),
norm_type
);
float
(
beta
));
}
else
{
...
...
profiler/src/profiler.cpp
View file @
9608beee
...
...
@@ -20,7 +20,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]);
int
profile_conv_bwd_data
(
int
,
char
*
[]);
int
profile_conv_bwd_weight
(
int
,
char
*
[]);
int
profile_grouped_conv_fwd
(
int
,
char
*
[]);
int
profile_
normalization
(
int
,
char
*
[]);
int
profile_
softmax
(
int
,
char
*
[]);
int
profile_layernorm
(
int
,
char
*
[]);
int
profile_groupnorm
(
int
,
char
*
[]);
int
profile_reduce
(
int
,
char
*
[]);
...
...
@@ -45,6 +45,7 @@ static void print_helper_message()
" conv_bwd_data: Convolution Backward Data
\n
"
" conv_bwd_weight: Convolution Backward Weight
\n
"
" grouped_conv_fwd: Grouped Convolution Forward
\n
"
" softmax: Softmax
\n
"
" reduce: Reduce
\n
"
);
// clang-format on
}
...
...
@@ -129,9 +130,9 @@ int main(int argc, char* argv[])
{
return
profile_reduce
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"batchnorm"
)
==
0
||
strcmp
(
argv
[
1
],
"softmax"
)
==
0
)
else
if
(
strcmp
(
argv
[
1
],
"softmax"
)
==
0
)
{
return
profile_
normalization
(
argc
,
argv
);
return
profile_
softmax
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"layernorm"
)
==
0
)
{
...
...
script/cmake-ck-dev.sh
View file @
9608beee
...
...
@@ -11,7 +11,7 @@ cmake
-D
CMAKE_CXX_FLAGS
=
"-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=
$PWD
"
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
BUILD_DEV
=
ON
\
-D
GPU_TARGETS
=
gfx908
;
gfx90a
\
-D
GPU_TARGETS
=
"
gfx908;gfx90a
"
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
USE_BITINT_EXTENSION_INT4
=
OFF
\
${
MY_PROJECT_SOURCE
}
...
...
script/cmake-ck-release.sh
View file @
9608beee
...
...
@@ -11,7 +11,7 @@ cmake
-D
CMAKE_CXX_FLAGS
=
"-O3"
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
BUILD_DEV
=
OFF
\
-D
GPU_TARGETS
=
gfx908
;
gfx90a
\
-D
GPU_TARGETS
=
"
gfx908;gfx90a
"
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
USE_BITINT_EXTENSION_INT4
=
OFF
\
${
MY_PROJECT_SOURCE
}
...
...
test/CMakeLists.txt
View file @
9608beee
...
...
@@ -26,7 +26,7 @@ function(add_gtest_executable TEST_NAME)
# suppress gtest warnings
target_compile_options
(
${
TEST_NAME
}
PRIVATE -Wno-global-constructors -Wno-undef
)
target_link_libraries
(
${
TEST_NAME
}
PRIVATE gtest_main
)
gtest_discover_tests
(
${
TEST_NAME
}
)
add_test
(
NAME
${
TEST_NAME
}
COMMAND $<TARGET_FILE:
${
TEST_NAME
}
>
)
rocm_install
(
TARGETS
${
TEST_NAME
}
COMPONENT tests
)
endfunction
(
add_gtest_executable TEST_NAME
)
...
...
@@ -52,3 +52,4 @@ add_subdirectory(block_to_ctile_map)
add_subdirectory
(
softmax
)
add_subdirectory
(
normalization
)
add_subdirectory
(
data_type
)
add_subdirectory
(
elementwise_normalization
)
test/elementwise_normalization/CMakeLists.txt
0 → 100644
View file @
9608beee
add_custom_target
(
test_elementwise_normalization
)
add_gtest_executable
(
test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp
)
target_link_libraries
(
test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance
)
add_dependencies
(
test_elementwise_normalization test_elementwise_layernorm_fp16
)
test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
0 → 100644
View file @
9608beee
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
ck
::
index_t
;
/// Typed GoogleTest fixture that drives the element-wise layernorm profiler
/// over a fixed list of 2-D problem sizes.
///
/// @tparam Tuple std::tuple whose elements are, in order: ADataType, BDataType,
///               GammaDataType, BetaDataType, AccDataType, YDataType.
template <typename Tuple>
class TestElementwiseLayernorm : public ::testing::Test
{
    protected:
    using ADataType     = std::tuple_element_t<0, Tuple>;
    using BDataType     = std::tuple_element_t<1, Tuple>;
    using GammaDataType = std::tuple_element_t<2, Tuple>;
    using BetaDataType  = std::tuple_element_t<3, Tuple>;
    using AccDataType   = std::tuple_element_t<4, Tuple>;
    using YDataType     = std::tuple_element_t<5, Tuple>;

    /// Calls profile_elementwise_layernorm_impl once per problem size and
    /// expects every call to return true.
    void Run()
    {
        // Each entry is {M, N}.
        const std::vector<std::vector<ck::index_t>> lengths = {
            {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}};

        // Iterate by const reference: iterating by value copied each inner vector.
        for(const auto& length : lengths)
        {
            // NOTE(review): the leading arguments presumably map to
            // (do_verification, init_method, do_log, time_kernel) - confirm
            // against profile_elementwise_layernorm_impl.hpp.
            const bool success =
                ck::profiler::profile_elementwise_layernorm_impl<ADataType,
                                                                 BDataType,
                                                                 GammaDataType,
                                                                 BetaDataType,
                                                                 AccDataType,
                                                                 YDataType>(
                    true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};
// Type list the typed test suite is instantiated with; each std::tuple is one
// combination of data types consumed by TestElementwiseLayernorm.
using
KernelTypes
=
::
testing
::
Types
<
// Tuple element order: ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType
std
::
tuple
<
F16
,
F16
,
F16
,
F16
,
F32
,
F16
>>
;
// Register TestElementwiseLayernorm as a typed suite over the tuples in KernelTypes.
TYPED_TEST_SUITE
(
TestElementwiseLayernorm
,
KernelTypes
);
// Test body: delegates to the fixture's Run() for the current type tuple.
TYPED_TEST
(
TestElementwiseLayernorm
,
Test_FP16
)
{
this
->
Run
();
}
test/normalization/CMakeLists.txt
View file @
9608beee
...
...
@@ -3,10 +3,11 @@ add_custom_target(test_layernorm)
add_gtest_executable
(
test_layernorm2d_fp32 test_layernorm2d_fp32.cpp
)
add_gtest_executable
(
test_layernorm2d_fp16 test_layernorm2d_fp16.cpp
)
add_gtest_executable
(
test_groupnorm_fp16 test_groupnorm_fp16.cpp
)
add_gtest_executable
(
test_groupnorm_fp32 test_groupnorm_fp32.cpp
)
add_gtest_executable
(
test_groupnorm_fp32 test_groupnorm_fp32.cpp
)
target_link_libraries
(
test_layernorm2d_fp32 PRIVATE utility
)
target_link_libraries
(
test_layernorm2d_fp16 PRIVATE utility
)
target_link_libraries
(
test_layernorm2d_fp32 PRIVATE utility device_normalization_instance
)
target_link_libraries
(
test_layernorm2d_fp16 PRIVATE utility device_normalization_instance
)
target_link_libraries
(
test_groupnorm_fp16 PRIVATE utility device_normalization_instance
)
target_link_libraries
(
test_groupnorm_fp32 PRIVATE utility device_normalization_instance
)
...
...
@@ -14,4 +15,3 @@ add_dependencies(test_layernorm test_layernorm2d_fp32)
add_dependencies
(
test_layernorm test_layernorm2d_fp16
)
add_dependencies
(
test_layernorm test_groupnorm_fp16
)
add_dependencies
(
test_layernorm test_groupnorm_fp32
)
test/normalization/test_groupnorm_fp16.cpp
View file @
9608beee
...
...
@@ -20,7 +20,7 @@ class TestGroupnorm : public ::testing::Test
void
Run
()
{
// N, H, W, G, C
//
[
N, H, W, G, C
], reduce H, W, C
std
::
vector
<
std
::
vector
<
ck
::
index_t
>>
lengths
=
{{
1
,
1
,
1
,
1
,
1
},
{
1
,
2
,
3
,
4
,
5
},
{
256
,
9
,
9
,
9
,
9
},
...
...
test/normalization/test_groupnorm_fp32.cpp
View file @
9608beee
...
...
@@ -20,7 +20,7 @@ class TestGroupnorm : public ::testing::Test
void
Run
()
{
// N, H, W, G, C
//
[
N, H, W, G, C
], reduce H, W, C
std
::
vector
<
std
::
vector
<
ck
::
index_t
>>
lengths
=
{{
1
,
1
,
1
,
1
,
1
},
{
1
,
2
,
3
,
4
,
5
},
{
256
,
9
,
9
,
9
,
9
},
...
...
Prev
1
…
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment