Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
6de15707
Unverified
Commit
6de15707
authored
Jun 30, 2020
by
gxiaotian
Committed by
GitHub
Jun 30, 2020
Browse files
Add OpEvo example (#2549)
parent
25c4c3b5
Changes
37
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
659 additions
and
0 deletions
+659
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_na2c.yml
...s/opevo/src/experiments/mm/N512K1024M1024/config_na2c.yml
+22
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_opevo.yml
.../opevo/src/experiments/mm/N512K1024M1024/config_opevo.yml
+25
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/search_space.json
...opevo/src/experiments/mm/N512K1024M1024/search_space.json
+1
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_gbfs.yml
...s/opevo/src/experiments/mm/N512K1024M4096/config_gbfs.yml
+23
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_na2c.yml
...s/opevo/src/experiments/mm/N512K1024M4096/config_na2c.yml
+22
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_opevo.yml
.../opevo/src/experiments/mm/N512K1024M4096/config_opevo.yml
+25
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/search_space.json
...opevo/src/experiments/mm/N512K1024M4096/search_space.json
+1
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_gbfs.yml
...s/opevo/src/experiments/mm/N512K4096M1024/config_gbfs.yml
+23
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_na2c.yml
...s/opevo/src/experiments/mm/N512K4096M1024/config_na2c.yml
+22
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_opevo.yml
.../opevo/src/experiments/mm/N512K4096M1024/config_opevo.yml
+25
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/search_space.json
...opevo/src/experiments/mm/N512K4096M1024/search_space.json
+1
-0
examples/trials/systems/opevo/src/run.sh
examples/trials/systems/opevo/src/run.sh
+25
-0
examples/trials/systems/opevo/src/templates/batch_matmul.py
examples/trials/systems/opevo/src/templates/batch_matmul.py
+119
-0
examples/trials/systems/opevo/src/templates/convfwd_direct.py
...ples/trials/systems/opevo/src/templates/convfwd_direct.py
+130
-0
examples/trials/systems/opevo/src/templates/matmul.py
examples/trials/systems/opevo/src/templates/matmul.py
+111
-0
examples/trials/systems/opevo/tvm_patches/libcuda.so.1
examples/trials/systems/opevo/tvm_patches/libcuda.so.1
+0
-0
examples/trials/systems/opevo/tvm_patches/tvm_v0.6.patch
examples/trials/systems/opevo/tvm_patches/tvm_v0.6.patch
+84
-0
No files found.
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_na2c.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M1024_NA2C
trialConcurrency
:
6
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
na2c.py
className
:
N_A2C
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
trial
:
command
:
OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_opevo.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M1024_OPEVO
trialConcurrency
:
8
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
opevo.py
className
:
OpEvo
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
parents_size
:
8
offspring_size
:
8
mutate_rate
:
0.5
trial
:
command
:
OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/search_space.json
0 → 100644
View file @
6de15707
{
"K"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
3
]},
"X"
:
{
"_type"
:
"factor"
,
"_value"
:
[
512
,
4
]},
"Y"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
4
]}}
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_gbfs.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M4096_GBFS
trialConcurrency
:
5
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
gbfs.py
className
:
G_BFS
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
num_samples
:
5
trial
:
command
:
OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_na2c.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M4096_NA2C
trialConcurrency
:
6
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
na2c.py
className
:
N_A2C
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
trial
:
command
:
OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_opevo.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M4096_OPEVO
trialConcurrency
:
8
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
opevo.py
className
:
OpEvo
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
parents_size
:
8
offspring_size
:
8
mutate_rate
:
0.5
trial
:
command
:
OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/search_space.json
0 → 100644
View file @
6de15707
{
"K"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
3
]},
"X"
:
{
"_type"
:
"factor"
,
"_value"
:
[
512
,
4
]},
"Y"
:
{
"_type"
:
"factor"
,
"_value"
:
[
4096
,
4
]}}
\ No newline at end of file
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_gbfs.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K4096M1024_GBFS
trialConcurrency
:
5
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
gbfs.py
className
:
G_BFS
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
num_samples
:
5
trial
:
command
:
OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_na2c.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K4096M1024_NA2C
trialConcurrency
:
6
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
na2c.py
className
:
N_A2C
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
trial
:
command
:
OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_opevo.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K4096M1024_OPEVO
trialConcurrency
:
8
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
opevo.py
className
:
OpEvo
# Any parameter need to pass to your tuner class __init__ constructor
# can be specified in this optional classArgs field, for example
classArgs
:
optimize_mode
:
maximize
parents_size
:
8
offspring_size
:
8
mutate_rate
:
0.5
trial
:
command
:
OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/search_space.json
0 → 100644
View file @
6de15707
{
"K"
:
{
"_type"
:
"factor"
,
"_value"
:
[
4096
,
3
]},
"X"
:
{
"_type"
:
"factor"
,
"_value"
:
[
512
,
4
]},
"Y"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
4
]}}
\ No newline at end of file
examples/trials/systems/opevo/src/run.sh
0 → 100644
View file @
6de15707
#!/bin/bash -e
# Entry point for one tuning trial: pick the compiler backend, set up the
# library / Python paths for the in-container TVM build, then launch the
# auto-tuning driver for the operator selected via the OP env var.
cd $(dirname $0)

# BACKEND defaults to "c-cuda"; the internal marker "#cuda" selects the
# native CUDA code path below.
export BACKEND=${BACKEND:-c-cuda}
if [[ "${BACKEND}" == "c-cuda" ]]; then
    export BACKEND="#cuda"
fi

if [[ "${BACKEND}" != "#cuda" ]]; then
    export LD_LIBRARY_PATH=/opt/tvm/build
else
    # NOTE(review): assumes the NVIDIA driver libraries are mounted at these
    # container paths — confirm against the deployment image (a libcuda.so.1
    # shim is shipped under tvm_patches/).
    export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
fi

# ROCm/HIP related settings (harmless on CUDA-only hosts).
export HIP_PLATFORM=hcc
export HSA_USERPTR_FOR_PAGED_MEM=0
export PYTHONDONTWRITEBYTECODE=1
export PYTHONPATH=/opt/tvm/python:/opt/tvm/topi/python:/opt/tvm/nnvm/python:/usr/local/rocm/src

ldconfig

# OP chooses the template module (default: matmul); S is an extra flag
# consumed by the driver (default 0 — presumably a step/seed switch; verify
# against compiler_auto_tune_stable.py). Remaining CLI args are forwarded.
time OP=${OP:-matmul} S=${S:-0} python3 ./compiler_auto_tune_stable.py "$@"
examples/trials/systems/opevo/src/templates/batch_matmul.py
0 → 100644
View file @
6de15707
import
numpy
as
np
import
tvm
import
logging
import
sys
,
time
,
subprocess
from
tvm
import
autotvm
import
topi
import
json
from
topi.util
import
get_const_tuple
import
os
# Batched-matmul problem description, overridable from the environment:
#   B — batch size, N/K/M — GEMM dimensions, P — transposition mode
#   ("NN", "NT", "TN" or "TT").
# Idiom fix: `int(os.environ['X']) if 'X' in os.environ else d` collapsed to
# the equivalent single-lookup `os.environ.get` form.
op_attributes = {
    "B": int(os.environ.get('B', 6)),
    "N": int(os.environ.get('N', 1024)),
    "K": int(os.environ.get('K', 64)),
    "M": int(os.environ.get('M', 4096)),
    "P": os.environ.get('P', "NN"),
}
@autotvm.template
def get_template_op(**kargs):
    """AutoTVM schedule template for a batched GEMM C[b] = A[b] @ B[b].

    Problem sizes and the transposition mode come from the module-level
    ``op_attributes`` dict (env-var driven).  Note the deliberate swap:
    the template's M is op_attributes["N"] and its N is op_attributes["M"].

    Tunable knobs: splits 'B' (batch, 2-way), 'K' (reduction, 3-way),
    'X'/'Y' (output rows/cols, 4-way each).

    Returns (schedule, [A, B, C]) as required by autotvm.
    """
    batch = op_attributes["B"]
    M = op_attributes["N"]
    K = op_attributes["K"]
    N = op_attributes["M"]
    pose = op_attributes["P"]
    # Declare A/B placeholders and the reduction for each transposition mode;
    # C is always (batch, M, N) — only the operand indexing changes.
    if pose == 'NN':
        A = tvm.placeholder((batch, M, K), name='A', dtype="float32")
        B = tvm.placeholder((batch, K, N), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(A[b, i, k] * B[b, k, j], axis=k), name='C')
    elif pose == 'NT':
        A = tvm.placeholder((batch, M, K), name='A', dtype="float32")
        B = tvm.placeholder((batch, N, K), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(A[b, i, k] * B[b, j, k], axis=k), name='C')
    elif pose == 'TN':
        A = tvm.placeholder((batch, K, M), name='A', dtype="float32")
        B = tvm.placeholder((batch, K, N), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(A[b, k, i] * B[b, k, j], axis=k), name='C')
    elif pose == 'TT':
        A = tvm.placeholder((batch, K, M), name='A', dtype="float32")
        B = tvm.placeholder((batch, N, K), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(A[b, k, i] * B[b, j, k], axis=k), name='C')
    else:
        # NOTE(review): bare `raise` outside an except clause produces a
        # RuntimeError; an explicit ValueError(pose) would be clearer.
        raise
    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
    # Stage the operands through shared and local memory caches.
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")
    b, y, x = C.op.axis
    k = CC.op.reduce_axis[0]
    # Tunable loop splits.
    cfg.define_split('B', cfg.axis(b), num_outputs=2)
    bo, bi = cfg['B'].apply(s, C, b)
    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    ko, kt, ki = cfg['K'].apply(s, CC, k)
    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    block_z = tvm.thread_axis('blockIdx.z')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')
    thread_z = tvm.thread_axis('threadIdx.z')
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)
    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)
    # Map loop levels onto the GPU grid: batch-outer -> blockIdx.z,
    # row/col blocks -> blockIdx.y/x, batch-inner -> threadIdx.z.
    s[C].bind(bo, block_z)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(bi, thread_z)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)
    bo, yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)
    # Local (register) stages follow the kt loop and are double-buffered.
    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
        s[stage].double_buffer()
    # Shared-memory stages: cooperative fetch split across the thread block,
    # with a 4-wide vectorized innermost load.
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)
        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)
        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()
    # 2 flops (mul + add) per reduction element.
    cfg.add_flop(batch * M * K * N * 2.0)
    return s, [A, B, C]
examples/trials/systems/opevo/src/templates/convfwd_direct.py
0 → 100644
View file @
6de15707
import
numpy
as
np
import
tvm
import
logging
import
sys
,
time
,
subprocess
from
tvm
import
autotvm
import
topi
import
json
from
topi.util
import
get_const_tuple
import
os
# Direct-convolution problem description, overridable from the environment:
#   N — batch, C — input channels, H/W — spatial size, F — output channels,
#   K — square kernel size, ST — stride, PD — padding.
# Idiom fix: `int(os.environ['X']) if 'X' in os.environ else d` collapsed to
# the equivalent single-lookup `os.environ.get` form.
op_attributes = {
    "N": int(os.environ.get('N', 64)),
    "C": int(os.environ.get('C', 3)),
    "H": int(os.environ.get('H', 229)),
    "W": int(os.environ.get('W', 229)),
    "F": int(os.environ.get('F', 32)),
    "K": int(os.environ.get('K', 5)),
    "ST": int(os.environ.get('ST', 1)),
    "PD": int(os.environ.get('PD', 2)),
}
@autotvm.template
def get_template_op(**kargs):
    """AutoTVM schedule template for a direct NCHW 2-D convolution.

    Problem sizes come from the module-level ``op_attributes`` dict
    (env-var driven).  Tunable knobs: 4-way splits of the output f/y/x axes,
    2-way splits of the reduction rc/ry/rx axes, and unroll knobs.

    Returns (schedule, [data, kernel, conv]) as required by autotvm.
    """
    N = op_attributes["N"]
    CI = op_attributes["C"]
    H = op_attributes["H"]
    W = op_attributes["W"]
    # NOTE(review): duplicate assignment — H was already set two lines up;
    # looks like a copy/paste leftover (harmless).
    H = op_attributes["H"]
    CO = op_attributes["F"]
    KH = KW = op_attributes["K"]
    stride = op_attributes["ST"]
    padding = op_attributes["PD"]
    dilation = 1
    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    conv = topi.nn.conv2d_nchw(data, kernel, (stride, stride), (padding, padding), dilation=1, out_dtype='float32')
    s = tvm.create_schedule([conv.op])
    cfg = autotvm.get_config()
    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_y", y, num_outputs=4)
    cfg.define_split("tile_x", x, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=2)
    cfg.define_split("tile_ry", ry, num_outputs=2)
    cfg.define_split("tile_rx", rx, num_outputs=2)
    cfg.define_knob("auto_unroll_max_step", [0, 125, 256])
    target = tvm.target.current_target()
    if target.target_name in ['nvptx', 'rocm']:
        cfg.define_knob("unroll_explicit", [1])
    else:
        cfg.define_knob("unroll_explicit", [0, 1])
    # Inline the padding stage produced by conv2d_nchw.
    pad_data, kernel = s[conv].op.input_tensors
    s[pad_data].compute_inline()
    if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
        s[kernel].compute_inline()
    if conv.op in s.outputs:
        output = conv
        OL = s.cache_write(conv, 'local')
    else:
        output = s.outputs[0].output(0)
        s[conv].set_scope('local')
        OL = conv
    # create cache stage
    AA = s.cache_read(pad_data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    kernel_scope, n = s[output].split(n, nparts=1)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    bf = s[output].fuse(n, bf)
    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)
    # tile reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
    # NOTE(review): 'tile_rx' is applied to ry and 'tile_ry' to rx — looks
    # like a swapped pair; benign only while KH == KW (as it is here, since
    # both come from the single "K" attribute). Confirm intent before reuse
    # with non-square kernels.
    ryo, ryi = cfg['tile_rx'].apply(s, OL, ry)
    rxo, rxi = cfg['tile_ry'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
    # unroll
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
    N, CO, OH, OW = get_const_tuple(output.shape)
    # NOTE(review): kernel was declared (CO, CI, KH, KW) above, so this
    # unpack mislabels the dims (KH<-CI, KW<-KH, CI<-KW). The flop count
    # below is unaffected because it only uses the product CI*KH*KW, but the
    # names are misleading — consider `_, CI, KH, KW = ...`.
    _, KH, KW, CI = get_const_tuple(kernel.shape)
    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
    return s, [data, kernel, conv]
examples/trials/systems/opevo/src/templates/matmul.py
0 → 100644
View file @
6de15707
import
numpy
as
np
import
tvm
import
logging
import
sys
,
time
,
subprocess
from
tvm
import
autotvm
import
topi
import
json
from
topi.util
import
get_const_tuple
import
os
# Matmul problem description, overridable from the environment:
#   N/K/M — GEMM dimensions, P — transposition mode ("NN"/"NT"/"TN"/"TT").
# Idiom fix: `int(os.environ['X']) if 'X' in os.environ else d` collapsed to
# the equivalent single-lookup `os.environ.get` form.
op_attributes = {
    "N": int(os.environ.get('N', 1024)),
    "K": int(os.environ.get('K', 64)),
    "M": int(os.environ.get('M', 4096)),
    "P": os.environ.get('P', "NN"),
}
@autotvm.template
def get_template_op(**kargs):
    """AutoTVM schedule template for a 2-D GEMM C = A @ B.

    Problem sizes and the transposition mode come from the module-level
    ``op_attributes`` dict (env-var driven); "N" is used as the row count
    (named ``batch`` here), "K" the reduction, "M" the column count.

    Tunable knobs: splits 'K' (reduction, 3-way) and 'X'/'Y' (output
    rows/cols, 4-way each).

    Returns (schedule, [A, B, C]) as required by autotvm.
    """
    batch = op_attributes["N"]
    in_dim = op_attributes["K"]
    out_dim = op_attributes["M"]
    pose = op_attributes["P"]
    # Declare A/B placeholders and the reduction for each transposition mode;
    # C is always (batch, out_dim) — only the operand indexing changes.
    if pose == 'NN':
        A = tvm.placeholder((batch, in_dim), name='A', dtype="float32")
        B = tvm.placeholder((in_dim, out_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
    elif pose == 'NT':
        A = tvm.placeholder((batch, in_dim), name='A', dtype="float32")
        B = tvm.placeholder((out_dim, in_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k), name='C')
    elif pose == 'TN':
        A = tvm.placeholder((in_dim, batch), name='A', dtype="float32")
        B = tvm.placeholder((in_dim, out_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(A[k, i] * B[k, j], axis=k), name='C')
    elif pose == 'TT':
        A = tvm.placeholder((in_dim, batch), name='A', dtype="float32")
        B = tvm.placeholder((out_dim, in_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(A[k, i] * B[j, k], axis=k), name='C')
    else:
        # NOTE(review): bare `raise` outside an except clause produces a
        # RuntimeError; an explicit ValueError(pose) would be clearer.
        raise
    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
    # 2 flops (mul + add) per reduction element.
    cfg.add_flop(batch * in_dim * out_dim * 2.0)
    # Stage the operands through shared and local memory caches.
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")
    y, x = C.op.axis
    k = CC.op.reduce_axis[0]
    # Tunable loop splits.
    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)
    ko, kt, ki = cfg['K'].apply(s, CC, k)
    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')
    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)
    # Map row/col blocks onto blockIdx.y/x and inner tiles onto threads.
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)
    yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)
    # Local (register) stages follow the kt loop.
    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
    # Shared-memory stages: cooperative fetch split across the thread block,
    # with a 4-wide vectorized innermost load and double buffering.
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)
        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)
        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()
    return s, [A, B, C]
examples/trials/systems/opevo/tvm_patches/libcuda.so.1
0 → 100644
View file @
6de15707
File added
examples/trials/systems/opevo/tvm_patches/tvm_v0.6.patch
0 → 100644
View file @
6de15707
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 76d088f4c..7ed4ff02a 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -122,7 +122,7 @@
class Tuner(object):
configs = self.next_batch(min(n_parallel, n_trial - i))
inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
- results = measure_batch(inputs)
+ results = self.parse_configs(self.task, configs) if hasattr(self, 'parse_configs') else measure_batch(inputs)
# keep best config
for k, (inp, res) in enumerate(zip(inputs, results)):
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index eab542dd3..2f1a11303 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -808,6 +808,7 @@
void CodeGenC::VisitStmt_(const AttrStmt* op) {
IterVar iv = Downcast<IterVar>(op->node);
if (iv->thread_tag.length() != 0) {
if (!var_idmap_.count(iv->var.get())) {
+ this->currentOp = op;
BindThreadIndex(iv);
}
}
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 8701cda1e..7d3d56ddc 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -174,6 +174,8 @@
class CodeGenC :
// Get a cast type from to
virtual std::string CastFromTo(std::string value, Type from, Type target);
+ const AttrStmt* currentOp;
+
protected:
// Print reference to struct location
std::string GetStructRef(
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 6656fa077..a4f0f962d 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -106,6 +106,9 @@
void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
CHECK(!var_idmap_.count(iv->var.get()));
var_idmap_[iv->var.get()] =
CastFromTo(iv->thread_tag, UInt(32), iv->var.type());
+ int nthread = static_cast<int>(this->currentOp->value.as<IntImm>()->value);
+ if (iv->thread_tag.find("threadIdx.") == 0 || iv->thread_tag.find("blockIdx.") == 0)
+ this->stream << " // [thread_extent] " << iv->thread_tag << " = " << nthread << "\n";
}
void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*)
diff --git a/src/codegen/opt/build_cuda_on.cc b/src/codegen/opt/build_cuda_on.cc
index 1992ac5d9..9b0ff4cd9 100644
--- a/src/codegen/opt/build_cuda_on.cc
+++ b/src/codegen/opt/build_cuda_on.cc
@@ -137,6 +137,9 @@
runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
cg.AddFunction(f);
}
std::string code = cg.Finish();
+ const auto* backendproc = Registry::Get("tvm_callback_backend_proc");
+ if (backendproc)
+ return CUDAModuleCreate((*backendproc)(code).operator std::string(), "cubin", ExtractFuncInfo(funcs), code);
if (const auto* f = Registry::Get("tvm_callback_cuda_postproc")) {
code = (*f)(code).operator std::string();
diff --git a/src/lang/expr_operator.cc b/src/lang/expr_operator.cc
index 220d4378c..cc435d138 100644
--- a/src/lang/expr_operator.cc
+++ b/src/lang/expr_operator.cc
@@ -208,11 +208,11 @@
Expr operator%(Expr a, Expr b) {
// TODO(tqchen): switch to floordiv
Expr indexdiv(Expr a, Expr b) {
- return floordiv(a, b);
+ return truncdiv(a, b);
}
Expr indexmod(Expr a, Expr b) {
- return floormod(a, b);
+ return truncmod(a, b);
}
Expr floordiv(Expr a, Expr b) {
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment