Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
60d30627
Commit
60d30627
authored
Mar 22, 2019
by
Jing Zhang
Browse files
added inline assembly into thread-wise gemm
parent
2c9b8c24
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
46 deletions
+27
-46
build/cmake-cuda.sh
build/cmake-cuda.sh
+0
-25
build/cmake-hip.sh
build/cmake-hip.sh
+0
-16
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+27
-5
No files found.
build/cmake-cuda.sh
deleted
100755 → 0
View file @
2c9b8c24
#!/bin/bash
rm
-f
CMakeCache.txt
rm
-f
*
.cmake
rm
-rf
CMakeFiles
MY_PROJECT_SOURCE
=
/home/chao/code/modular_convolution
MY_PROJECT_INSTALL
=
../install.dir
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_CXX_COMPILER
=
clang++
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
DEVICE_BACKEND
=
CUDA
\
-D
BOOST_ROOT
=
"/package/install/boost_1.67.0"
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc"
\
-D
CMAKE_CUDA_FLAGS
=
"-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61"
\
${
MY_PROJECT_SOURCE
}
#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \
#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \
#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61 -Xptxas -v -maxrregcount=128" \
build/cmake-hip.sh
deleted
100755 → 0
View file @
2c9b8c24
#!/bin/bash
rm
-f
CMakeCache.txt
rm
-f
*
.cmake
rm
-rf
CMakeFiles
MY_PROJECT_SOURCE
=
/home/chao/code/modular_convolution
MY_PROJECT_INSTALL
=
../install.dir
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
DEVICE_BACKEND
=
"HIP"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
${
MY_PROJECT_SOURCE
}
src/include/threadwise_gemm.hip.hpp
View file @
60d30627
...
@@ -53,17 +53,39 @@ __device__ void threadwise_gemm(MatrixA,
...
@@ -53,17 +53,39 @@ __device__ void threadwise_gemm(MatrixA,
constexpr
unsigned
N
=
c_mtx
.
NCol
();
constexpr
unsigned
N
=
c_mtx
.
NCol
();
constexpr
unsigned
K
=
a_mtx
.
NRow
();
// A is transposed
constexpr
unsigned
K
=
a_mtx
.
NRow
();
// A is transposed
assert
(
M
==
8
);
assert
(
N
==
8
);
assert
(
K
==
1
);
for
(
unsigned
k
=
0
;
k
<
K
;
++
k
)
for
(
unsigned
k
=
0
;
k
<
K
;
++
k
)
{
{
const
unsigned
bindex
=
b_mtx
.
Get1dIndex
(
k
,
0
);
for
(
unsigned
i
=
0
;
i
<
M
;
++
i
)
for
(
unsigned
i
=
0
;
i
<
M
;
++
i
)
{
{
for
(
unsigned
j
=
0
;
j
<
N
;
++
j
)
const
unsigned
aindex
=
a_mtx
.
Get1dIndex
(
k
,
i
);
// A is transposed
const
unsigned
cindex
=
c_mtx
.
Get1dIndex
(
i
,
0
);
//N = 8
//for(unsigned j = 0; j < N; ++j)
{
{
const
unsigned
a
index
=
a
_mtx
.
Get1dIndex
(
k
,
i
);
// A is transposed
//
const unsigned
b
index =
b
_mtx.Get1dIndex(k,
j
);
const
unsigned
b
index
=
b
_mtx
.
Get1dIndex
(
k
,
j
);
//
const unsigned
c
index =
c
_mtx.Get1dIndex(
i
, j);
const
unsigned
cindex
=
c_mtx
.
Get1dIndex
(
i
,
j
);
//f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]
);
f_accum
(
p_c_thread
[
cindex
],
p_a_thread
[
aindex
]
*
p_b_thread
[
bindex
]);
asm
volatile
(
"
\n
\
v_mac_f32 %0, %8, %9
\n
\
v_mac_f32 %1, %8, %10
\n
\
v_mac_f32 %2, %8, %11
\n
\
v_mac_f32 %3, %8, %12
\n
\
v_mac_f32 %4, %8, %13
\n
\
v_mac_f32 %5, %8, %14
\n
\
v_mac_f32 %6, %8, %15
\n
\
v_mac_f32 %7, %8, %16
\n
\
"
:
"=v"
(
p_c_thread
[
cindex
+
0
]),
"=v"
(
p_c_thread
[
cindex
+
1
]),
"=v"
(
p_c_thread
[
cindex
+
2
]),
"=v"
(
p_c_thread
[
cindex
+
3
]),
"=v"
(
p_c_thread
[
cindex
+
4
]),
"=v"
(
p_c_thread
[
cindex
+
5
]),
"=v"
(
p_c_thread
[
cindex
+
6
]),
"=v"
(
p_c_thread
[
cindex
+
7
])
:
"v"
(
p_a_thread
[
aindex
]),
"v"
(
p_b_thread
[
bindex
+
0
]),
"v"
(
p_b_thread
[
bindex
+
1
]),
"v"
(
p_b_thread
[
bindex
+
2
]),
"v"
(
p_b_thread
[
bindex
+
3
]),
"v"
(
p_b_thread
[
bindex
+
4
]),
"v"
(
p_b_thread
[
bindex
+
5
]),
"v"
(
p_b_thread
[
bindex
+
6
]),
"v"
(
p_b_thread
[
bindex
+
7
]),
"0"
(
p_c_thread
[
cindex
+
0
]),
"1"
(
p_c_thread
[
cindex
+
1
]),
"2"
(
p_c_thread
[
cindex
+
2
]),
"3"
(
p_c_thread
[
cindex
+
3
]),
"4"
(
p_c_thread
[
cindex
+
4
]),
"5"
(
p_c_thread
[
cindex
+
5
]),
"6"
(
p_c_thread
[
cindex
+
6
]),
"7"
(
p_c_thread
[
cindex
+
7
])
);
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment