gaoqiong / composable_kernel · Commits

Commit 57f2d3c3, authored May 04, 2022 by myamlak

Review remarks

Parent: c82abef1
Showing 11 changed files with 25 additions and 25 deletions.
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp (+3 −3)
include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp (+1 −1)
include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp (+1 −1)
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp (+1 −1)
library/include/ck/library/host_tensor/host_tensor.hpp (+1 −1)
library/src/host_tensor/host_tensor.cpp (+2 −2)
profiler/include/profile_convnd_bwd_data_impl.hpp (+1 −1)
profiler/include/profile_grouped_gemm_impl.hpp (+10 −10)
test/gemm_split_k/gemm_split_k.cpp (+1 −1)
test/grouped_gemm/grouped_gemm_fp16.cpp (+3 −3)
test/reduce/reduce_util.hpp (+1 −1)
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp

@@ -131,7 +131,7 @@ int main(int argc, char* argv[])
     std::size_t flop = 0, num_btype = 0;
-    for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
             gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));

@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
         }
     }
-    for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         a_tensors_device.emplace_back(
             std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpace()));

@@ -213,7 +213,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-        for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+        for(std::size_t i = 0; i < gemm_shapes.size(); i++)
         {
             c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
             auto ref_gemm = ReferenceGemmInstance{};
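The same one-line change recurs in every file of this commit: the loop index switches from a signed int compared against a converted container size to a plain std::size_t compared directly against size(). As a minimal standalone sketch of the pattern (hypothetical shapes vector and static_cast standing in for ck::type_convert, not the CK types used above):

#include <cstddef>
#include <vector>

int main()
{
    std::vector<float> shapes(4, 1.0f); // stand-in for gemm_shapes

    // Old pattern: signed index, size() converted to a signed type before the comparison.
    // for(int i = 0; i < static_cast<int>(shapes.size()); i++) { ... }

    // New pattern from this commit: the index type matches the container's size_type,
    // so no conversion is needed and no signed/unsigned comparison arises.
    for(std::size_t i = 0; i < shapes.size(); i++)
    {
        shapes[i] *= 2.0f; // placeholder loop body
    }
    return 0;
}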
include/ck/tensor_operation/gpu/device/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp

@@ -698,7 +698,7 @@ struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         }
         // Gridwise GEMM size
-        for(int i = 0; i < ck::type_convert<int>(arg.a_grid_desc_k0_m_k1_container_.size()); i++)
+        for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
include/ck/tensor_operation/gpu/device/device_convnd_bwd_data_xdl_ndhwc_kzyxc_ndhwk.hpp

@@ -1413,7 +1413,7 @@ struct DeviceConvndBwdDataXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho
         }
         // Gridwise GEMM size
-        for(int i = 0; i < ck::type_convert<int>(arg.a_grid_desc_k0_m_k1_container_.size()); i++)
+        for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++)
         {
             if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i],
                                             arg.b_grid_desc_k0_n_k1_container_[i],
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp

@@ -301,7 +301,7 @@ struct DeviceGroupedGemmXdl
        gemm_desc_kernel_arg_.reserve(group_count_);
-       for(index_t i = 0; i < ck::type_convert<index_t>(gemm_shapes.size()); i++)
+       for(std::size_t i = 0; i < gemm_shapes.size(); i++)
        {
            const index_t M = gemm_shapes[i].M;
            const index_t N = gemm_shapes[i].N;
library/include/ck/library/host_tensor/host_tensor.hpp

@@ -316,7 +316,7 @@ float check_error(const Tensor<T>& ref, const Tensor<T>& result)
     constexpr float eps = 1e-10;
-    for(int i = 0; i < ck::type_convert<int>(ref.mData.size()); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
     {
         float ref_v    = ck::type_convert<float>(ref.mData[i]);
         float result_v = ck::type_convert<float>(result.mData[i]);
library/src/host_tensor/host_tensor.cpp

@@ -25,7 +25,7 @@ std::size_t HostTensorDescriptor::GetElementSize() const
 std::size_t HostTensorDescriptor::GetElementSpace() const
 {
     std::size_t space = 1;
-    for(int i = 0; i < ck::type_convert<int>(mLens.size()); ++i)
+    for(std::size_t i = 0; i < mLens.size(); ++i)
     {
         space += (mLens[i] - 1) * mStrides[i];
     }

@@ -68,7 +68,7 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
 // FIXME: remove
 void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
 {
-    for(int i = 0; i < ck::type_convert<int>(src.mData.size()); ++i)
+    for(std::size_t i = 0; i < src.mData.size(); ++i)
         dst.mData[i] = ck::type_convert<float>(src.mData[i]);
 }
 #endif
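The GetElementSpace() body touched above computes the minimum number of elements a strided descriptor must be backed by: start from 1 (the element at offset 0) and add (length − 1) * stride for every dimension. A standalone sketch of that calculation, using a hypothetical free function and plain vectors in place of the mLens/mStrides members:

#include <cstddef>
#include <vector>

// Hypothetical helper mirroring the loop in HostTensorDescriptor::GetElementSpace().
std::size_t element_space(const std::vector<std::size_t>& lens,
                          const std::vector<std::size_t>& strides)
{
    std::size_t space = 1; // element at offset 0
    for(std::size_t i = 0; i < lens.size(); ++i)
    {
        space += (lens[i] - 1) * strides[i]; // farthest reachable offset along dimension i
    }
    return space;
}

// Example: a 2x3 row-major tensor (lens {2, 3}, strides {3, 1}) needs 1 + 1*3 + 2*1 = 6 elements.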
profiler/include/profile_convnd_bwd_data_impl.hpp

@@ -222,7 +222,7 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 {
     float max_diff = 1e-6;
-    for(int i = 0; i < ck::type_convert<int>(ref.mData.size()); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
     {
         float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
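The check_out() hunk above (and the identical one in test/gemm_split_k/gemm_split_k.cpp below) walks both tensors, takes the absolute difference in double precision, and compares it against a 1e-6 threshold. A self-contained sketch of that comparison over plain vectors; only the diff/threshold lines come from the hunk, the early return on failure is an assumption:

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch of a check_out()-style absolute-difference test.
bool check_out_sketch(const std::vector<float>& ref, const std::vector<float>& result)
{
    float max_diff = 1e-6;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        float diff = std::abs(double(ref[i]) - double(result[i]));
        if(max_diff < diff)
        {
            return false; // assumed failure path; not shown in the hunk
        }
    }
    return true;
}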
profiler/include/profile_grouped_gemm_impl.hpp

@@ -50,12 +50,12 @@ void profile_grouped_gemm_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                int nrepeat,
-                               std::vector<int> Ms,
-                               std::vector<int> Ns,
-                               std::vector<int> Ks,
-                               std::vector<int> StrideAs,
-                               std::vector<int> StrideBs,
-                               std::vector<int> StrideCs)
+                               const std::vector<int>& Ms,
+                               const std::vector<int>& Ns,
+                               const std::vector<int>& Ks,
+                               const std::vector<int>& StrideAs,
+                               const std::vector<int>& StrideBs,
+                               const std::vector<int>& StrideCs)
 {
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {

@@ -83,7 +83,7 @@ void profile_grouped_gemm_impl(int do_verification,
     std::vector<Tensor<BDataType>> b_k_n;
     std::vector<Tensor<CDataType>> c_m_n_device_results;
-    for(int i = 0; i < ck::type_convert<int>(Ms.size()); i++)
+    for(std::size_t i = 0; i < Ms.size(); i++)
     {
         a_m_k.push_back(
             Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));

@@ -144,7 +144,7 @@ void profile_grouped_gemm_impl(int do_verification,
     gemm_shapes.reserve(group_count);
-    for(int i = 0; i < ck::type_convert<int>(group_count); i++)
+    for(std::size_t i = 0; i < group_count; i++)
     {
         a_device_buf.emplace_back(
             std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpace()));

@@ -234,7 +234,7 @@ void profile_grouped_gemm_impl(int do_verification,
     float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
     std::size_t flop = 0, num_btype = 0;
-    for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+    for(std::size_t i = 0; i < gemm_shapes.size(); i++)
     {
         flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];

@@ -258,7 +258,7 @@ void profile_grouped_gemm_impl(int do_verification,
     if(do_verification)
     {
-        for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+        for(std::size_t i = 0; i < gemm_shapes.size(); i++)
         {
             c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
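Beyond the loop-index cleanup, this file also changes the profiler entry point so that the six std::vector<int> parameters are taken by const reference instead of by value, which avoids copying every size/stride list on each call. A minimal illustration of the two signature styles (hypothetical function names, not the profiler API):

#include <numeric>
#include <vector>

// By value: the caller's vector is copied into Ms on every call.
int sum_by_value(std::vector<int> Ms) { return std::accumulate(Ms.begin(), Ms.end(), 0); }

// By const reference (the form this commit switches to): read-only access, no copy.
int sum_by_cref(const std::vector<int>& Ms) { return std::accumulate(Ms.begin(), Ms.end(), 0); }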
test/gemm_split_k/gemm_split_k.cpp

@@ -45,7 +45,7 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 {
     float max_diff = 1e-6;
-    for(int i = 0; i < ck::type_convert<int>(ref.mData.size()); ++i)
+    for(std::size_t i = 0; i < ref.mData.size(); ++i)
     {
         float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
         if(max_diff < diff)
test/grouped_gemm/grouped_gemm_fp16.cpp

@@ -104,7 +104,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
    b_tensors_device.reserve(group_count);
    c_tensors_device.reserve(group_count);
-   for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+   for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors.emplace_back(Tensor<ADataType>(f_host_tensor_descriptor(
            gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));

@@ -119,7 +119,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
        b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
    }
-   for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+   for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        a_tensors_device.emplace_back(
            std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));

@@ -147,7 +147,7 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
    invoker_ptr->Run(argument_ptr.get());
-   for(int i = 0; i < ck::type_convert<int>(gemm_shapes.size()); i++)
+   for(std::size_t i = 0; i < gemm_shapes.size(); i++)
    {
        c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
test/reduce/reduce_util.hpp

@@ -9,7 +9,7 @@ namespace reduce_util {
 template <typename T>
 void to_f32_vector(const Tensor<T>& src, Tensor<float>& dst)
 {
-    for(int i = 0; i < ck::type_convert<int>(src.mData.size()); ++i)
+    for(std::size_t i = 0; i < src.mData.size(); ++i)
        dst.mData[i] = type_convert<float>(src.mData[i]);
 }