gaoqiong / composable_kernel

Commit 6af8e20b, authored Mar 12, 2022 by Jing Zhang
parent e3a4b967

clean

Showing 3 changed files with 32 additions and 127 deletions:
example/14_grouped_gemm/grouped_gemm_xdl_fp16.cpp                    +1   -1
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp   +1   -1
profiler/include/profile_grouped_gemm_impl.hpp                       +30  -125
example/14_grouped_gemm/grouped_gemm_xdl_fp16.cpp

@@ -161,7 +161,7 @@ int main(int argc, char* argv[])
         a_tensors_device.push_back(
             std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize()));
         b_tensors_device.push_back(
-            std::make_unique<DeviceMem>(sizeof(BDataType) * a_tensors[i].mDesc.GetElementSize()));
+            std::make_unique<DeviceMem>(sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize()));
         c_tensors_device.push_back(std::make_unique<DeviceMem>(
             sizeof(CDataType) * c_device_tensors[i].mDesc.GetElementSize()));
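The one-line fix above sizes the B device buffer from b_tensors instead of a_tensors. A small helper can make that kind of copy-paste slip harder to repeat; this is a hypothetical sketch (device_bytes is not part of composable_kernel), reusing the Tensor/DeviceMem usage exactly as it appears in the hunk:

    #include <cstddef>

    // Hypothetical convenience: derive the byte count from the tensor that will
    // actually occupy the buffer, so the A/B/C sizes cannot be mixed up.
    template <typename DataType, typename TensorT>
    std::size_t device_bytes(const TensorT& t)
    {
        return sizeof(DataType) * t.mDesc.GetElementSize();
    }

    // usage, mirroring the loop above:
    // b_tensors_device.push_back(
    //     std::make_unique<DeviceMem>(device_bytes<BDataType>(b_tensors[i])));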
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp

@@ -54,7 +54,7 @@ template <typename ADataType,
           ck::index_t CThreadTransferSrcDstVectorDim,
           ck::index_t CThreadTransferDstScalarPerVector,
           ck::index_t NumPrefetch   = 1,
-          ck::index_t MaxGroupCount = 5>
+          ck::index_t MaxGroupCount = 10>
 struct DeviceGroupedGemmXdl
     : public DeviceGroupedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
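The only change here raises the default MaxGroupCount template argument from 5 to 10, i.e. the number of GEMM problems a single DeviceGroupedGemmXdl instantiation is dimensioned for at compile time. Purely as an illustration of that pattern (the container and check below are invented for this sketch, not CK's internals), a compile-time group capacity usually pairs with a runtime guard:

    #include <array>
    #include <cstddef>
    #include <stdexcept>

    // Sketch only: per-group descriptors live in fixed storage sized by
    // MaxGroupCount, so the number of groups accepted at runtime is bounded
    // by a compile-time constant.
    template <typename GroupDesc, std::size_t MaxGroupCount = 10>
    struct GroupedArgumentSketch
    {
        std::array<GroupDesc, MaxGroupCount> groups{};
        std::size_t group_count = 0;

        void AddGroup(const GroupDesc& g)
        {
            if(group_count >= MaxGroupCount)
                throw std::runtime_error("group count exceeds MaxGroupCount");
            groups[group_count++] = g;
        }
    };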
profiler/include/profile_grouped_gemm_impl.hpp
@@ -73,8 +73,6 @@ void profile_grouped_gemm_impl(int do_verification,
     std::vector<Tensor<BDataType>> b_k_n;
     std::vector<Tensor<CDataType>> c_m_n_device_results;

-    // int A_size = 0, B_size = 0, C_size = 0;
     for(int i = 0; i < Ms.size(); i++)
     {
         a_m_k.push_back(
@@ -104,12 +102,7 @@ void profile_grouped_gemm_impl(int do_verification,
             b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
         }

-        // set zero to c_device_buf
         c_m_n_device_results[i].GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
-
-        // A_size += a_m_k[i].mDesc.GetElementSpace();
-        // B_size += b_k_n[i].mDesc.GetElementSpace();
-        // C_size += c_m_n_device_results[i].mDesc.GetElementSpace();
     }

     using AElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -125,85 +118,23 @@ void profile_grouped_gemm_impl(int do_verification,
     // }
-    // std::vector<DeviceMem> a_device_buf, b_device_buf, c_device_buf;
-    std::vector<void*> a_device_buf, b_device_buf, c_device_buf;
-    // DeviceMem a_device_buf_(sizeof(ADataType) * A_size);
-    // DeviceMem b_device_buf_(sizeof(BDataType) * B_size);
-    // DeviceMem c_device_buf_(sizeof(CDataType) * C_size);
-    // std::vector<ADataType> a_tensors_data;
-    // std::vector<BDataType> b_tensors_data;
-    // std::vector<CDataType> c_tensors_data;
+    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
+    std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;

     std::vector<GemmShape> gemm_shapes;

-    // A_size = 0;
-    // B_size = 0;
-    // C_size = 0;
     for(int i = 0; i < Ms.size(); i++)
     {
-        // a_tensors_data.insert(a_tensors_data.end(), a_m_k[i].mData.begin(),
-        // a_m_k[i].mData.end()); b_tensors_data.insert(b_tensors_data.end(),
-        // b_k_n[i].mData.begin(), b_k_n[i].mData.end());
-        // c_tensors_data.insert(c_tensors_data.end(), c_m_n_device_results[i].mData.begin(),
-        // c_m_n_device_results[i].mData.end());
-        void *a_device_buf_, *b_device_buf_, *c_device_buf_;
-        hipGetErrorString(hipMalloc(static_cast<void**>(&a_device_buf_),
-                                    sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpace()));
-        hipGetErrorString(hipMalloc(static_cast<void**>(&b_device_buf_),
-                                    sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpace()));
-        hipGetErrorString(hipMalloc(static_cast<void**>(&c_device_buf_),
-                                    sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpace()));
-        // DeviceMem a_device_buf_(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpace());
-        // DeviceMem b_device_buf_(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpace());
-        // DeviceMem c_device_buf_(sizeof(CDataType) *
-        // c_m_n_device_results[i].mDesc.GetElementSpace());
-        hipGetErrorString(hipMemcpy(a_device_buf_,
-                                    a_m_k[i].mData.data(),
-                                    sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpace(),
-                                    hipMemcpyHostToDevice));
-        hipGetErrorString(hipMemcpy(b_device_buf_,
-                                    b_k_n[i].mData.data(),
-                                    sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpace(),
-                                    hipMemcpyHostToDevice));
-        hipGetErrorString(hipMemcpy(c_device_buf_,
-                                    c_m_n_device_results[i].mData.data(),
-                                    sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpace(),
-                                    hipMemcpyHostToDevice));
-        // a_device_buf_.ToDevice(a_m_k[i].mData.data());
-        // b_device_buf_.ToDevice(b_k_n[i].mData.data());
-        // c_device_buf_.ToDevice(c_m_n_device_results[i].mData.data());
-        a_device_buf.push_back(a_device_buf_);
-        b_device_buf.push_back(b_device_buf_);
-        c_device_buf.push_back(c_device_buf_);
-        // a_device_buf.push_back(a_device_buf_);
-        // b_device_buf.push_back(b_device_buf_);
-        // c_device_buf.push_back(c_device_buf_);
-        // gemm_shapes.push_back({Ms[i],
-        //                        Ns[i],
-        //                        Ks[i],
-        //                        StrideAs[i],
-        //                        StrideBs[i],
-        //                        StrideCs[i],
-        //                        a_device_buf[i].GetDeviceBuffer(),
-        //                        b_device_buf[i].GetDeviceBuffer(),
-        //                        c_device_buf[i].GetDeviceBuffer()});
-        // printf("%p %p %p\n",
-        //        a_device_buf[i].GetDeviceBuffer(),
-        //        b_device_buf[i].GetDeviceBuffer(),
-        //        c_device_buf[i].GetDeviceBuffer());
+        a_device_buf.push_back(
+            std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSize()));
+        b_device_buf.push_back(
+            std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSize()));
+        c_device_buf.push_back(std::make_unique<DeviceMem>(
+            sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSize()));
+
+        a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
+        b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
+        c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data());
+
         gemm_shapes.push_back({Ms[i],
                                Ns[i],
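The bulk of the cleanup in this hunk (and the verification hunk further down) replaces std::vector<void*> plus manual hipMalloc/hipMemcpy with std::vector<std::unique_ptr<DeviceMem>>, so each buffer owns its allocation and host/device copies go through ToDevice/FromDevice. Below is a minimal, self-contained sketch of that ownership pattern using the HIP runtime directly; the DeviceMem here is a stand-in written for illustration (its constructor and ToDevice/FromDevice/GetDeviceBuffer mirror the calls visible in the diff, not CK's actual implementation), and error handling is omitted for brevity:

    #include <hip/hip_runtime.h>
    #include <cstddef>
    #include <memory>
    #include <vector>

    // Illustrative RAII wrapper: owns one device allocation, freed automatically.
    struct DeviceMem
    {
        explicit DeviceMem(std::size_t bytes) : size_(bytes) { hipMalloc(&ptr_, bytes); }
        ~DeviceMem() { hipFree(ptr_); }
        DeviceMem(const DeviceMem&) = delete;
        DeviceMem& operator=(const DeviceMem&) = delete;

        void ToDevice(const void* host) { hipMemcpy(ptr_, host, size_, hipMemcpyHostToDevice); }
        void FromDevice(void* host) { hipMemcpy(host, ptr_, size_, hipMemcpyDeviceToHost); }
        void* GetDeviceBuffer() { return ptr_; }

        void* ptr_        = nullptr;
        std::size_t size_ = 0;
    };

    int main()
    {
        using DeviceMemPtr = std::unique_ptr<DeviceMem>;

        std::vector<float> host(1024, 1.0f);
        std::vector<DeviceMemPtr> bufs;

        // Same shape as the profiler loop above: allocate, then copy host -> device.
        bufs.push_back(std::make_unique<DeviceMem>(sizeof(float) * host.size()));
        bufs[0]->ToDevice(host.data());

        // ... launch a kernel against bufs[0]->GetDeviceBuffer() ...

        bufs[0]->FromDevice(host.data()); // copy results back for verification
        return 0; // unique_ptr destructors call hipFree; no manual cleanup needed
    }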
@@ -211,19 +142,11 @@ void profile_grouped_gemm_impl(int do_verification,
                                StrideAs[i],
                                StrideBs[i],
                                StrideCs[i],
-                               a_device_buf_,
-                               b_device_buf_,
-                               c_device_buf_});
-        // A_size += a_m_k[i].mDesc.GetElementSpace();
-        // B_size += b_k_n[i].mDesc.GetElementSpace();
-        // C_size += c_m_n_device_results[i].mDesc.GetElementSpace();
+                               a_device_buf[i]->GetDeviceBuffer(),
+                               b_device_buf[i]->GetDeviceBuffer(),
+                               c_device_buf[i]->GetDeviceBuffer()});
     }

-    // a_device_buf_.ToDevice(a_tensors_data.data());
-    // b_device_buf_.ToDevice(b_tensors_data.data());
-    // c_device_buf_.ToDevice(c_tensors_data.data());
     // add device GEMM instances
     std::vector<
         ck::tensor_operation::device::device_grouped_gemm_instance::DeviceGroupedGemmNoOpPtr>
@@ -328,16 +251,19 @@ void profile_grouped_gemm_impl(int do_verification,
         float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

-#if 0
-        std::size_t flop = std::size_t(2) * M * N * K;
-        std::size_t num_btype =
-            sizeof(ADataType) * M * K + sizeof(BDataType) * K * M + sizeof(CDataType) * M * N;
+        std::size_t flop = 0, num_btype = 0;
+        for(int i = 0; i < gemm_shapes.size(); i++)
+        {
+            flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
+
+            num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ms[i] +
+                         sizeof(CDataType) * Ms[i] * Ns[i];
+        }

         float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

         float gb_per_sec = num_btype / 1.E6 / ave_time;

         std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                   << gb_per_sec << " GB/s, " << gemm_name << std::endl;
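The now-enabled perf block sums flop = 2 * M * N * K per group plus the corresponding A/B/C byte traffic, then divides by the averaged time in milliseconds: flop / 1e9 / ms comes out directly in TFLOP/s, and bytes / 1e6 / ms in GB/s. A quick self-contained check of those unit conversions, with made-up numbers for a single fp16 group:

    #include <cstdio>

    int main()
    {
        // One hypothetical fp16 GEMM group: M = N = K = 1024, 2 bytes per element.
        double flop        = 2.0 * 1024 * 1024 * 1024;  // ~2.147e9 FLOP
        double bytes       = 2.0 * (1024.0 * 1024 * 3); // A + B + C traffic, ~6.29e6 bytes
        double ave_time_ms = 0.5;                       // measured kernel time

        // flop / 1e9 gives GFLOP; dividing by milliseconds gives GFLOP/ms == TFLOP/s.
        double tflops = flop / 1.e9 / ave_time_ms;      // ~4.29 TFLOP/s
        // bytes / 1e6 gives MB; MB/ms == GB/s.
        double gb_per_sec = bytes / 1.e6 / ave_time_ms; // ~12.6 GB/s

        std::printf("%.2f TFlops, %.2f GB/s\n", tflops, gb_per_sec);
        return 0;
    }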
@@ -348,33 +274,13 @@ void profile_grouped_gemm_impl(int do_verification,
             best_ave_time   = ave_time;
             best_gb_per_sec = gb_per_sec;
         }
-#endif

         if(do_verification)
         {
-            // c_tensors_data.resize(C_size);
-            // c_device_buf_.FromDevice(c_tensors_data.data());
-            // C_size = 0;
-            // for(int i = 0; i < gemm_shapes.size(); i++)
-            //{
-            // memcpy(c_m_n_device_results[i].mData.data(),
-            // c_tensors_data.data() + C_size,
-            // c_m_n_device_results[i].mDesc.GetElementSpace() * sizeof(CDataType));
-            // C_size += c_m_n_device_results[i].mDesc.GetElementSpace();
-            //}
             for(int i = 0; i < gemm_shapes.size(); i++)
             {
-                hipGetErrorString(hipMemcpy(c_m_n_device_results[i].mData.data(),
-                                            c_device_buf[i],
-                                            sizeof(CDataType) *
-                                                c_m_n_device_results[i].mDesc.GetElementSpace(),
-                                            hipMemcpyDeviceToHost));
-                // hipGetErrorString(hipFree(c_device_buf[i]));
+                c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());

                 Tensor<CDataType> c_m_n_host_result(
                     f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
@@ -402,16 +308,15 @@ void profile_grouped_gemm_impl(int do_verification,
                 if(do_log)
                 {
-                    // LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
-                    //<< std::endl;
-                    // LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") <<
-                    // std::endl;
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl;
                     LogRangeAsType<float>(
                         std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
                         << std::endl;
-                    // LogRangeAsType<float>(
-                    // std::cout << "c_host : ", c_m_n_host_result.mData, ",")
-                    // << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "c_host : ", c_m_n_host_result.mData, ",")
+                        << std::endl;
                 }
             }
         }