Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7277329e
"vscode:/vscode.git/clone" did not exist on "c3df6d87d42ed0f4b047405a558b9850ff7b888e"
Commit
7277329e
authored
Apr 22, 2023
by
Jing Zhang
Committed by
root
Apr 22, 2023
Browse files
minor fix
parent
c38d8fdc
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
5 additions
and
7 deletions
+5
-7
example/15_grouped_gemm/run_grouped_gemm_example.inc
example/15_grouped_gemm/run_grouped_gemm_example.inc
+1
-0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
...tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+3
-3
profiler/include/profiler/profile_grouped_gemm_impl.hpp
profiler/include/profiler/profile_grouped_gemm_impl.hpp
+1
-4
No files found.
example/15_grouped_gemm/run_grouped_gemm_example.inc
View file @
7277329e
...
...
@@ -147,6 +147,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
#else
a_tensors_device
[
i
]
->
ToDevice
(
a_tensors
[
i
]
.
mData
.
data
());
b_tensors_device
[
i
]
->
ToDevice
(
b_tensors
[
i
]
.
mData
.
data
());
c_tensors_device
[
i
]
->
SetZero
();
#endif
p_a
.
push_back
(
a_tensors_device
[
i
]
->
GetDeviceBuffer
());
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
View file @
7277329e
...
...
@@ -533,9 +533,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
return
;
}
const
index_t
block_m_id
=
__builtin_amdgcn_readfirstlane
(
block
Idx
.
y
);
const
index_t
block_n_id
=
__builtin_amdgcn_readfirstlane
(
block
Idx
.
x
);
const
index_t
k_batch_id
=
__builtin_amdgcn_readfirstlane
(
block
Idx
.
z
);
const
index_t
block_m_id
=
__builtin_amdgcn_readfirstlane
(
block
_work_idx
[
I1
]
);
const
index_t
block_n_id
=
__builtin_amdgcn_readfirstlane
(
block
_work_idx
[
I2
]
);
const
index_t
k_batch_id
=
__builtin_amdgcn_readfirstlane
(
block
_work_idx
[
I0
]
);
// HACK: this force m/n_block_data_idx_on_grid into SGPR
const
index_t
m_block_data_idx_on_grid
=
...
...
profiler/include/profiler/profile_grouped_gemm_impl.hpp
View file @
7277329e
...
...
@@ -98,8 +98,6 @@ bool profile_grouped_gemm_impl(int do_verification,
a_m_k
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
},
num_thread
);
b_k_n
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
},
num_thread
);
}
c_m_n_device_results
[
i
].
GenerateTensorValue
(
GeneratorTensor_0
<
CDataType
>
{},
num_thread
);
}
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
...
...
@@ -134,13 +132,12 @@ bool profile_grouped_gemm_impl(int do_verification,
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
ADataType
)
*
a_m_k
[
i
].
mDesc
.
GetElementSpaceSize
()));
b_device_buf
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
BDataType
)
*
b_k_n
[
i
].
mDesc
.
GetElementSpaceSize
()));
c_device_buf
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
CDataType
)
*
c_m_n_device_results
[
i
].
mDesc
.
GetElementSpaceSize
()));
a_device_buf
[
i
]
->
ToDevice
(
a_m_k
[
i
].
mData
.
data
());
b_device_buf
[
i
]
->
ToDevice
(
b_k_n
[
i
].
mData
.
data
());
c_device_buf
[
i
]
->
ToDevice
(
c_m_n_device_results
[
i
].
mData
.
data
()
);
c_device_buf
[
i
]
->
SetZero
(
);
gemm_descs
.
push_back
({
Ms
[
i
],
Ns
[
i
],
Ks
[
i
],
StrideAs
[
i
],
StrideBs
[
i
],
StrideCs
[
i
],
{}});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment