Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
bdbc0eaa
Commit
bdbc0eaa
authored
Apr 02, 2019
by
Chao Liu
Browse files
cleaning up dead code
parent
7c098ddc
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
6 additions
and
670 deletions
+6
-670
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
+1
-1
driver/driver.hip.cpp
driver/driver.hip.cpp
+1
-1
script/cmake-cuda.sh
script/cmake-cuda.sh
+0
-0
script/cmake-hip.sh
script/cmake-hip.sh
+0
-0
src/include/blockwise_batched_gemm.hip.hpp
src/include/blockwise_batched_gemm.hip.hpp
+0
-455
src/include/blockwise_gemm.hip.hpp
src/include/blockwise_gemm.hip.hpp
+4
-213
No files found.
driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp
View file @
bdbc0eaa
...
@@ -221,7 +221,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
...
@@ -221,7 +221,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
BlockSize
=
128
;
#elif 0
#elif 0
// 1x1, 14x14, Vega 20, hack CPerBlock = 1
// 1x1, 14x14, Vega 20, hack CPerBlock = 1
for debugging
constexpr
index_t
BPerBlock
=
64
;
constexpr
index_t
BPerBlock
=
64
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
KPerBlock
=
128
;
constexpr
index_t
CPerBlock
=
1
;
constexpr
index_t
CPerBlock
=
1
;
...
...
driver/driver.hip.cpp
View file @
bdbc0eaa
...
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
...
@@ -580,7 +580,7 @@ int main(int argc, char* argv[])
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
#elif
1
#elif
0
// 1x1 filter, 14x14 image, C = 2048
// 1x1 filter, 14x14 image, C = 2048
constexpr
index_t
N
=
128
;
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
2048
;
constexpr
index_t
C
=
2048
;
...
...
build
/cmake-cuda.sh
→
script
/cmake-cuda.sh
View file @
bdbc0eaa
File moved
build
/cmake-hip.sh
→
script
/cmake-hip.sh
View file @
bdbc0eaa
File moved
src/include/blockwise_batched_gemm.hip.hpp
View file @
bdbc0eaa
This diff is collapsed.
Click to expand it.
src/include/blockwise_gemm.hip.hpp
View file @
bdbc0eaa
...
@@ -3,215 +3,6 @@
...
@@ -3,215 +3,6 @@
// Declaration (no definition here) of the C-linkage function __to_local, which takes a
// generic void* and whose return type carries the address_space(3) attribute; the
// declaration is additionally tagged with the [[hc]] attribute.
// NOTE(review): address space 3 is presumably the on-chip local/LDS memory space of the
// HCC/AMDGPU toolchain -- confirm against the compiler documentation.
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
extern
"C"
__attribute__
((
address_space
(
3
)))
void
*
__to_local
(
void
*
p
)[[
hc
]];
// Blockwise GEMM helper: every thread of a BlockSize-thread block accumulates its own
// ThreadMatrixC tile from the block-level matrices described by BlockMatrixA and
// BlockMatrixB.
//
// Template parameters:
//   BlockSize                        - number of threads expected in the block
//                                      (checked by a static_assert).
//   BlockMatrixA / BlockMatrixB      - compile-time descriptors of the block matrices.
//   ThreadMatrixC                    - compile-time descriptor of the per-thread C tile.
//   TransA / TransB / TransC         - transpose flags. Only the combination
//                                      (TransA, !TransB, !TransC) is implemented.
//   KPerThreadLoop                   - number of K rows consumed per loop iteration in Run().
//   MThreadPerCluster / NThreadPerCluster
//                                    - threads per cluster along the M / N dimension.
//   DistributeThreadAlongColumnFirst - thread-distribution order; only `true` is implemented.
template <index_t BlockSize,
          class BlockMatrixA,
          class BlockMatrixB,
          class ThreadMatrixC,
          bool TransA,
          bool TransB,
          bool TransC,
          index_t KPerThreadLoop,
          index_t MThreadPerCluster,
          index_t NThreadPerCluster,
          bool DistributeThreadAlongColumnFirst>
struct BlockwiseGemmBlockABlockBThreadC
{
    // Element offsets of this thread's first element inside the A and B block matrices,
    // computed once in the constructor and added to the block pointers in Run().
    index_t mMyThreadOffsetA = 0;
    index_t mMyThreadOffsetB = 0;

    // (row, col) coordinate inside a matrix.
    struct MatrixIndex
    {
        index_t row;
        index_t col;
    };

    // Looks up where the calling thread's C tile begins and derives from it the thread's
    // starting offsets into A and B, taking the transpose flags into account.
    __device__ BlockwiseGemmBlockABlockBThreadC()
    {
        constexpr auto a_block_mtx = BlockMatrixA{};
        constexpr auto b_block_mtx = BlockMatrixB{};

        const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());

        // The thread's C row selects a row of A (or a column of A when A is transposed).
        mMyThreadOffsetA = (!TransA) ? a_block_mtx.Get1dIndex(c_thread_mtx_index.row, 0)
                                     : a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row);

        // The thread's C column selects a column of B (or a row of B when B is transposed).
        mMyThreadOffsetB = (!TransB) ? b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col)
                                     : b_block_mtx.Get1dIndex(c_thread_mtx_index.col, 0);

#if 0
        // Debug dump. MatrixIndex has only `row` and `col`, so no batch index is printed.
        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
        {
            print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: ");
            print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: ");
            print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: ");

            printf("%u %u, %u %u, %u %u\n",
                   get_block_1d_id(),
                   get_thread_local_1d_id(),
                   c_thread_mtx_index.row,
                   c_thread_mtx_index.col,
                   mMyThreadOffsetA,
                   mMyThreadOffsetB);
        }
#endif
    }

    // Returns the (row, col) at which the C tile of thread `thread_id` begins inside the
    // block-level C matrix.
    //
    // Only (TransA, !TransB, !TransC) with DistributeThreadAlongColumnFirst == true is
    // implemented. Every other configuration asserts and, when assertions are compiled
    // out, returns {0, 0} so that control never runs off the end of this non-void function.
    __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const
    {
        if(TransA && (!TransB) && (!TransC))
        {
            constexpr auto a_block_mtx = BlockMatrixA{};
            constexpr auto b_block_mtx = BlockMatrixB{};

            static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                          "wrong! k dimension not consistent!");

            constexpr index_t MPerBlock = a_block_mtx.NCol();
            constexpr index_t NPerBlock = b_block_mtx.NCol();

            constexpr auto c_thread_mtx = ThreadMatrixC{};

            // divide thread work
            constexpr index_t MPerThread = c_thread_mtx.NRow();
            constexpr index_t NPerThread = c_thread_mtx.NCol();

            static_assert(MPerBlock % (MPerThread * MThreadPerCluster) == 0,
                          "MPerBlock % (MPerThread * MThreadPerCluster) != 0");

            static_assert(NPerBlock % (NPerThread * NThreadPerCluster) == 0,
                          "NPerBlock % (NPerThread * NThreadPerCluster) != 0");

            // Number of clusters needed along M and N (ceiling division).
            constexpr index_t MClusterWork =
                (MPerBlock + MPerThread * MThreadPerCluster - 1) / (MPerThread * MThreadPerCluster);

            constexpr index_t NClusterWork =
                (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);

            static_assert(BlockSize ==
                              (MClusterWork * MThreadPerCluster) *
                                  (NClusterWork * NThreadPerCluster),
                          "wrong! wrong BlockSize");

            if(DistributeThreadAlongColumnFirst)
            {
                // Split thread_id into (cluster index, index inside the cluster) ...
                const index_t cluster_work_block_id =
                    thread_id / (MThreadPerCluster * NThreadPerCluster);

                const index_t thread_work_cluster_id =
                    thread_id - cluster_work_block_id * (MThreadPerCluster * NThreadPerCluster);

                // ... then split each of those into an M part and an N part.
                const index_t m_cluster_work_block_id = cluster_work_block_id / NClusterWork;
                const index_t n_cluster_work_block_id =
                    cluster_work_block_id - m_cluster_work_block_id * NClusterWork;

                const index_t m_thread_work_cluster_id = thread_work_cluster_id / NThreadPerCluster;
                const index_t n_thread_work_cluster_id =
                    thread_work_cluster_id - m_thread_work_cluster_id * NThreadPerCluster;

#if 0
                if(get_block_1d_id() == 0)
                {
                    printf("%u %u, \t"
                           "MClusterWork %u MThreadPerCluster %u NClusterWork %u NThreadPerCluster %u \t"
                           "m_cluster_work_block_id %u n_cluster_work_block_id %u \t"
                           "m_thread_work_cluster_id %u n_thread_work_cluster_id %u \t"
                           "\n",
                           get_block_1d_id(), get_thread_local_1d_id(),
                           MClusterWork, MThreadPerCluster, NClusterWork, NThreadPerCluster,
                           m_cluster_work_block_id, n_cluster_work_block_id,
                           m_thread_work_cluster_id, n_thread_work_cluster_id);
                }
#endif

                return MatrixIndex{m_cluster_work_block_id * (MThreadPerCluster * MPerThread) +
                                       m_thread_work_cluster_id * MPerThread,
                                   n_cluster_work_block_id * (NThreadPerCluster * NPerThread) +
                                       n_thread_work_cluster_id * NPerThread};
            }
            else
            {
                // not implemented
                assert(false);
                return MatrixIndex{0, 0}; // keeps the function well-defined when assert is disabled
            }
        }
        else
        {
            // not implemented
            assert(false);
            return MatrixIndex{0, 0}; // keeps the function well-defined when assert is disabled
        }
    }

    // Maps a coordinate inside a thread's C tile to its distance from the beginning of
    // that tile; here this is the identity mapping.
    // this should be optimized away if input is known
    __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(index_t m_in_c,
                                                                      index_t n_in_c)
    {
        return MatrixIndex{m_in_c, n_in_c};
    }

    // Accumulates this thread's C tile.
    //
    //   p_a_block  - start of the A block matrix; mMyThreadOffsetA is added internally.
    //   p_b_block  - start of the B block matrix; mMyThreadOffsetB is added internally.
    //   p_c_thread - this thread's C tile, updated through f_accum by threadwise_gemm.
    //   f_accum    - accumulation functor forwarded unchanged to threadwise_gemm.
    //
    // K is walked in steps of KPerThreadLoop: each step copies a KPerThreadLoop-row slice
    // of A and of B into thread-local arrays and calls threadwise_gemm on them.
    // For any flag combination other than (TransA, !TransB, !TransC) this function does
    // nothing.
    template <class FloatA, class FloatB, class FloatC, class Accumulator>
    __device__ void Run(const FloatA* __restrict__ p_a_block,
                        const FloatB* __restrict__ p_b_block,
                        FloatC* __restrict__ p_c_thread,
                        Accumulator f_accum) const
    {
        if(TransA && (!TransB) && (!TransC))
        {
            constexpr auto True  = integral_constant<bool, true>{};
            constexpr auto False = integral_constant<bool, false>{};

            constexpr auto a_block_mtx  = BlockMatrixA{};
            constexpr auto b_block_mtx  = BlockMatrixB{};
            constexpr auto c_thread_mtx = ThreadMatrixC{};

            constexpr index_t KPerBlock = a_block_mtx.NRow(); // A is transposed

            constexpr index_t MPerThread = c_thread_mtx.NRow();
            constexpr index_t NPerThread = c_thread_mtx.NCol();

            // a is transposed, b is not
            constexpr auto a_thread_mtx =
                make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});

            constexpr auto b_thread_mtx =
                make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});

            // Thread-local staging arrays for one K-slice of A and of B.
            FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
            FloatB p_b_thread[b_thread_mtx.GetElementSpace()];

            // loop over k
            for(index_t k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
            {
                threadwise_matrix_copy(a_block_mtx,
                                       p_a_block + mMyThreadOffsetA +
                                           k_begin * a_block_mtx.RowStride(),
                                       a_thread_mtx,
                                       p_a_thread,
                                       a_thread_mtx.GetLengths());

                threadwise_matrix_copy(b_block_mtx,
                                       p_b_block + mMyThreadOffsetB +
                                           k_begin * b_block_mtx.RowStride(),
                                       b_thread_mtx,
                                       p_b_thread,
                                       b_thread_mtx.GetLengths());

                threadwise_gemm(a_thread_mtx,
                                True,
                                p_a_thread,
                                b_thread_mtx,
                                False,
                                p_b_thread,
                                c_thread_mtx,
                                False,
                                p_c_thread,
                                f_accum);
            }
        }
    }
};
// if following number are power of 2, index calculation shall be greatly reduced:
// if following number are power of 2, index calculation shall be greatly reduced:
// MPerThreadSubC, NPerThreadSubC, MLevel0Cluster, NLevel0Cluster, MLevel1Cluster, NLevel1Cluster
// MPerThreadSubC, NPerThreadSubC, MLevel0Cluster, NLevel0Cluster, MLevel1Cluster, NLevel1Cluster
template
<
index_t
BlockSize
,
template
<
index_t
BlockSize
,
...
@@ -1149,10 +940,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
...
@@ -1149,10 +940,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
}
}
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
,
class
Accumulator
>
template
<
class
FloatA
,
class
FloatB
,
class
FloatC
,
class
Accumulator
>
__device__
void
Run_
v2
(
const
FloatA
*
__restrict__
p_a_block
,
__device__
void
Run_
PipelineReadAndCompute
(
const
FloatA
*
__restrict__
p_a_block
,
const
FloatB
*
__restrict__
p_b_block
,
const
FloatB
*
__restrict__
p_b_block
,
FloatC
*
__restrict__
p_c_thread
,
FloatC
*
__restrict__
p_c_thread
,
Accumulator
f_accum
)
const
Accumulator
f_accum
)
const
{
{
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
constexpr
auto
True
=
integral_constant
<
bool
,
true
>
{};
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
constexpr
auto
False
=
integral_constant
<
bool
,
false
>
{};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment