Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
6de15707
Unverified
Commit
6de15707
authored
Jun 30, 2020
by
gxiaotian
Committed by
GitHub
Jun 30, 2020
Browse files
Add OpEvo example (#2549)
parent
25c4c3b5
Changes
37
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
659 additions
and
0 deletions
+659
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_na2c.yml
...s/opevo/src/experiments/mm/N512K1024M1024/config_na2c.yml
+22
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_opevo.yml
.../opevo/src/experiments/mm/N512K1024M1024/config_opevo.yml
+25
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/search_space.json
...opevo/src/experiments/mm/N512K1024M1024/search_space.json
+1
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_gbfs.yml
...s/opevo/src/experiments/mm/N512K1024M4096/config_gbfs.yml
+23
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_na2c.yml
...s/opevo/src/experiments/mm/N512K1024M4096/config_na2c.yml
+22
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_opevo.yml
.../opevo/src/experiments/mm/N512K1024M4096/config_opevo.yml
+25
-0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/search_space.json
...opevo/src/experiments/mm/N512K1024M4096/search_space.json
+1
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_gbfs.yml
...s/opevo/src/experiments/mm/N512K4096M1024/config_gbfs.yml
+23
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_na2c.yml
...s/opevo/src/experiments/mm/N512K4096M1024/config_na2c.yml
+22
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_opevo.yml
.../opevo/src/experiments/mm/N512K4096M1024/config_opevo.yml
+25
-0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/search_space.json
...opevo/src/experiments/mm/N512K4096M1024/search_space.json
+1
-0
examples/trials/systems/opevo/src/run.sh
examples/trials/systems/opevo/src/run.sh
+25
-0
examples/trials/systems/opevo/src/templates/batch_matmul.py
examples/trials/systems/opevo/src/templates/batch_matmul.py
+119
-0
examples/trials/systems/opevo/src/templates/convfwd_direct.py
...ples/trials/systems/opevo/src/templates/convfwd_direct.py
+130
-0
examples/trials/systems/opevo/src/templates/matmul.py
examples/trials/systems/opevo/src/templates/matmul.py
+111
-0
examples/trials/systems/opevo/tvm_patches/libcuda.so.1
examples/trials/systems/opevo/tvm_patches/libcuda.so.1
+0
-0
examples/trials/systems/opevo/tvm_patches/tvm_v0.6.patch
examples/trials/systems/opevo/tvm_patches/tvm_v0.6.patch
+84
-0
No files found.
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_na2c.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M1024_NA2C
trialConcurrency
:
6
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
na2c.py
className
:
N_A2C
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
trial
:
command
:
OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/config_opevo.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M1024_OPEVO
trialConcurrency
:
8
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
opevo.py
className
:
OpEvo
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
parents_size
:
8
offspring_size
:
8
mutate_rate
:
0.5
trial
:
command
:
OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M1024/search_space.json
0 → 100644
View file @
6de15707
{
"K"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
3
]},
"X"
:
{
"_type"
:
"factor"
,
"_value"
:
[
512
,
4
]},
"Y"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
4
]}}
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_gbfs.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M4096_GBFS
trialConcurrency
:
5
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
gbfs.py
className
:
G_BFS
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
num_samples
:
5
trial
:
command
:
OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_na2c.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M4096_NA2C
trialConcurrency
:
6
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
na2c.py
className
:
N_A2C
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
trial
:
command
:
OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/config_opevo.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K1024M4096_OPEVO
trialConcurrency
:
8
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
opevo.py
className
:
OpEvo
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
parents_size
:
8
offspring_size
:
8
mutate_rate
:
0.5
trial
:
command
:
OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K1024M4096/search_space.json
0 → 100644
View file @
6de15707
{
"K"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
3
]},
"X"
:
{
"_type"
:
"factor"
,
"_value"
:
[
512
,
4
]},
"Y"
:
{
"_type"
:
"factor"
,
"_value"
:
[
4096
,
4
]}}
\ No newline at end of file
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_gbfs.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K4096M1024_GBFS
trialConcurrency
:
5
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
gbfs.py
className
:
G_BFS
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
num_samples
:
5
trial
:
command
:
OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_na2c.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K4096M1024_NA2C
trialConcurrency
:
6
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
na2c.py
className
:
N_A2C
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
trial
:
command
:
OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/config_opevo.yml
0 → 100644
View file @
6de15707
authorName
:
default
experimentName
:
MatMul_N512K4096M1024_OPEVO
trialConcurrency
:
8
maxExecDuration
:
24h
maxTrialNum
:
512
#choice: local, remote, pai
trainingServicePlatform
:
local
searchSpacePath
:
search_space.json
#choice: true, false
useAnnotation
:
false
tuner
:
codeDir
:
/root/algorithms/
classFileName
:
opevo.py
className
:
OpEvo
# Any parameters that need to be passed to your tuner class's __init__ constructor
# can be specified in this optional classArgs field, for example:
classArgs
:
optimize_mode
:
maximize
parents_size
:
8
offspring_size
:
8
mutate_rate
:
0.5
trial
:
command
:
OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
codeDir
:
/root
# gpuNum: 0
examples/trials/systems/opevo/src/experiments/mm/N512K4096M1024/search_space.json
0 → 100644
View file @
6de15707
{
"K"
:
{
"_type"
:
"factor"
,
"_value"
:
[
4096
,
3
]},
"X"
:
{
"_type"
:
"factor"
,
"_value"
:
[
512
,
4
]},
"Y"
:
{
"_type"
:
"factor"
,
"_value"
:
[
1024
,
4
]}}
\ No newline at end of file
examples/trials/systems/opevo/src/run.sh
0 → 100644
View file @
6de15707
#!/bin/bash -e
# Trial entry point: prepares the environment and launches the auto-tuning
# driver. Operator selection and problem sizes are taken from the caller's
# environment (e.g. OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh).

# The shebang's -e is dropped when the script is started as `bash run.sh`,
# so enable exit-on-error explicitly as well.
set -e

# Run from the script's own directory; quoted so paths with spaces work.
cd "$(dirname "$0")"

# BACKEND defaults to c-cuda, which is rewritten to the "#cuda" spelling.
export BACKEND=${BACKEND:-c-cuda}

if [[ "${BACKEND}" == "c-cuda" ]]; then
  export BACKEND="#cuda"
fi

# Any backend other than "#cuda" loads libraries from the TVM build tree;
# "#cuda" uses the NVIDIA driver library directories instead.
if [[ "${BACKEND}" != "#cuda" ]]; then
  export LD_LIBRARY_PATH=/opt/tvm/build
else
  export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
fi

# NOTE(review): HIP_* / HSA_* look like ROCm runtime settings - confirm they
# are still needed for CUDA-only runs.
export HIP_PLATFORM=hcc
export HSA_USERPTR_FOR_PAGED_MEM=0
export PYTHONDONTWRITEBYTECODE=1
export PYTHONPATH=/opt/tvm/python:/opt/tvm/topi/python:/opt/tvm/nnvm/python:/usr/local/rocm/src

ldconfig

# OP defaults to matmul and S to 0; all script arguments are forwarded as-is.
time OP=${OP:-matmul} S=${S:-0} python3 ./compiler_auto_tune_stable.py "$@"
examples/trials/systems/opevo/src/templates/batch_matmul.py
0 → 100644
View file @
6de15707
import
numpy
as
np
import
tvm
import
logging
import
sys
,
time
,
subprocess
from
tvm
import
autotvm
import
topi
import
json
from
topi.util
import
get_const_tuple
import
os
# Batched-matmul problem description, read once at import time from the
# environment. Integer sizes fall back to the defaults below when the
# variable is unset; "P" is the two-letter operand layout code.
op_attributes = {
    "B": int(os.environ.get('B', 6)),
    "N": int(os.environ.get('N', 1024)),
    "K": int(os.environ.get('K', 64)),
    "M": int(os.environ.get('M', 4096)),
    "P": os.environ.get('P', "NN"),
}
@autotvm.template
def get_template_op(**kargs):
    """Build the tunable batched-matmul compute and its GPU schedule.

    Problem sizes and the operand layout are taken from the module-level
    ``op_attributes`` dict. ``P`` is a two-letter code; the first letter
    applies to A and the second to B:

    * A is ``(batch, M, K)`` for ``N`` and ``(batch, K, M)`` for ``T``.
    * B is ``(batch, K, N)`` for ``N`` and ``(batch, N, K)`` for ``T``.

    The output C is always ``(batch, M, N)`` in float32 inputs' product sum.

    Split knobs are registered in this order: ``'B'`` (batch axis, 2 parts),
    ``'K'`` (reduction axis, 3 parts), ``'X'`` (row axis of C, 4 parts) and
    ``'Y'`` (column axis of C, 4 parts).

    Args:
        **kargs: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(schedule, [A, B, C])``.

    Raises:
        ValueError: if ``op_attributes["P"]`` is not one of
            ``'NN'``, ``'NT'``, ``'TN'``, ``'TT'``.
    """
    batch = op_attributes["B"]
    # Note the naming swap: attribute "N" is the row count of C (local M)
    # and attribute "M" is its column count (local N).
    M = op_attributes["N"]
    K = op_attributes["K"]
    N = op_attributes["M"]
    pose = op_attributes["P"]

    # Reject unknown layout codes explicitly so the error names the value;
    # a bare `raise` here would only report "No active exception to reraise".
    if pose not in ('NN', 'NT', 'TN', 'TT'):
        raise ValueError(
            "Unsupported layout P=%r; expected one of 'NN', 'NT', 'TN', 'TT'" % (pose,))

    a_transposed = pose[0] == 'T'
    b_transposed = pose[1] == 'T'

    A = tvm.placeholder((batch, K, M) if a_transposed else (batch, M, K),
                        name='A', dtype="float32")
    B = tvm.placeholder((batch, N, K) if b_transposed else (batch, K, N),
                        name='B', dtype="float32")
    k = tvm.reduce_axis((0, K), name='k')

    def _product(b, i, j):
        # Index each operand according to its storage layout.
        lhs = A[b, k, i] if a_transposed else A[b, i, k]
        rhs = B[b, j, k] if b_transposed else B[b, k, j]
        return tvm.sum(lhs * rhs, axis=k)

    C = tvm.compute((batch, M, N), _product, name='C')

    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)

    # Stage both inputs through "shared" then "local" scope, and accumulate
    # the output in "local" scope.
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    b, y, x = C.op.axis
    k = CC.op.reduce_axis[0]

    cfg.define_split('B', cfg.axis(b), num_outputs=2)
    bo, bi = cfg['B'].apply(s, C, b)

    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    ko, kt, ki = cfg['K'].apply(s, CC, k)

    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    block_z = tvm.thread_axis('blockIdx.z')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')
    thread_z = tvm.thread_axis('threadIdx.z')

    # Knob 'X' tiles the row axis (y) and knob 'Y' tiles the column axis (x).
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)

    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)

    s[C].bind(bo, block_z)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(bi, thread_z)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)

    bo, yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)

    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
        s[stage].double_buffer()

    # Shared-scope loads: fuse all axes, then split the fused axis by the
    # thread counts chosen for 'X' and 'Y' and vectorize the innermost 4.
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)

        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)

        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()

    # One multiply and one add per (batch, row, col, k) element.
    cfg.add_flop(batch * M * K * N * 2.0)
    return s, [A, B, C]
examples/trials/systems/opevo/src/templates/convfwd_direct.py
0 → 100644
View file @
6de15707
import
numpy
as
np
import
tvm
import
logging
import
sys
,
time
,
subprocess
from
tvm
import
autotvm
import
topi
import
json
from
topi.util
import
get_const_tuple
import
os
# Direct-convolution problem description, read once at import time from the
# environment; every entry is an integer with the default shown when the
# variable is unset.
op_attributes = {
    "N": int(os.environ.get('N', 64)),
    "C": int(os.environ.get('C', 3)),
    "H": int(os.environ.get('H', 229)),
    "W": int(os.environ.get('W', 229)),
    "F": int(os.environ.get('F', 32)),
    "K": int(os.environ.get('K', 5)),
    "ST": int(os.environ.get('ST', 1)),
    "PD": int(os.environ.get('PD', 2)),
}
@autotvm.template
def get_template_op(**kargs):
    """Build the tunable direct conv2d (NCHW) compute and its GPU schedule.

    Sizes come from the module-level ``op_attributes`` dict: input is
    ``(N, C, H, W)``, the kernel is ``(F, C, K, K)`` (square), with stride
    ``ST`` and padding ``PD`` applied to both spatial dimensions. Dilation is
    fixed to 1.

    Registered knobs: splits ``tile_f``/``tile_y``/``tile_x`` (4 parts each),
    ``tile_rc``/``tile_ry``/``tile_rx`` (2 parts each), and the choice knobs
    ``auto_unroll_max_step`` and ``unroll_explicit``.

    Args:
        **kargs: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(schedule, [data, kernel, conv])``.
    """
    N = op_attributes["N"]
    CI = op_attributes["C"]
    H = op_attributes["H"]
    W = op_attributes["W"]
    CO = op_attributes["F"]
    KH = KW = op_attributes["K"]
    stride = op_attributes["ST"]
    padding = op_attributes["PD"]
    dilation = 1

    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    conv = topi.nn.conv2d_nchw(data, kernel, (stride, stride), (padding, padding),
                               dilation=dilation, out_dtype='float32')
    s = tvm.create_schedule([conv.op])
    cfg = autotvm.get_config()

    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_y", y, num_outputs=4)
    cfg.define_split("tile_x", x, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=2)
    cfg.define_split("tile_ry", ry, num_outputs=2)
    cfg.define_split("tile_rx", rx, num_outputs=2)
    cfg.define_knob("auto_unroll_max_step", [0, 125, 256])

    # nvptx and rocm targets are only offered explicit unrolling.
    target = tvm.target.current_target()
    if target.target_name in ['nvptx', 'rocm']:
        cfg.define_knob("unroll_explicit", [1])
    else:
        cfg.define_knob("unroll_explicit", [0, 1])

    # From here on `kernel` is the conv op's second input tensor.
    pad_data, kernel = s[conv].op.input_tensors

    s[pad_data].compute_inline()
    if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
        s[kernel].compute_inline()

    if conv.op in s.outputs:
        output = conv
        OL = s.cache_write(conv, 'local')
    else:
        output = s.outputs[0].output(0)
        s[conv].set_scope('local')
        OL = conv

    # create cache stage
    AA = s.cache_read(pad_data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])

    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    # Outer axis of a single part; the unroll pragmas below attach to it.
    kernel_scope, n = s[output].split(n, nparts=1)

    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

    bf = s[output].fuse(n, bf)
    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)

    # tile reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    # Each reduction axis is split with the knob that was defined on it
    # (ry <-> tile_ry, rx <-> tile_rx).
    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)

    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)

    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))

    # unroll
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)

    # Kernel layout is (CO, CI, KH, KW), matching the placeholder above.
    N, CO, OH, OW = get_const_tuple(output.shape)
    _, CI, KH, KW = get_const_tuple(kernel.shape)
    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)

    return s, [data, kernel, conv]
examples/trials/systems/opevo/src/templates/matmul.py
0 → 100644
View file @
6de15707
import
numpy
as
np
import
tvm
import
logging
import
sys
,
time
,
subprocess
from
tvm
import
autotvm
import
topi
import
json
from
topi.util
import
get_const_tuple
import
os
# Matmul problem description, read once at import time from the environment.
# Integer sizes fall back to the defaults below when the variable is unset;
# "P" is the two-letter operand layout code.
op_attributes = {
    "N": int(os.environ.get('N', 1024)),
    "K": int(os.environ.get('K', 64)),
    "M": int(os.environ.get('M', 4096)),
    "P": os.environ.get('P', "NN"),
}
@autotvm.template
def get_template_op(**kargs):
    """Build the tunable matmul compute and its GPU schedule.

    Problem sizes and the operand layout are taken from the module-level
    ``op_attributes`` dict (``N`` rows, ``K`` reduction length, ``M``
    columns). ``P`` is a two-letter code; the first letter applies to A and
    the second to B:

    * A is ``(N, K)`` for ``N`` and ``(K, N)`` for ``T``.
    * B is ``(K, M)`` for ``N`` and ``(M, K)`` for ``T``.

    The output C is always ``(N, M)``.

    Split knobs are registered in this order: ``'K'`` (reduction axis,
    3 parts), ``'X'`` (row axis of C, 4 parts) and ``'Y'`` (column axis of C,
    4 parts).

    Args:
        **kargs: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(schedule, [A, B, C])``.

    Raises:
        ValueError: if ``op_attributes["P"]`` is not one of
            ``'NN'``, ``'NT'``, ``'TN'``, ``'TT'``.
    """
    batch = op_attributes["N"]
    in_dim = op_attributes["K"]
    out_dim = op_attributes["M"]
    pose = op_attributes["P"]

    # Reject unknown layout codes explicitly so the error names the value;
    # a bare `raise` here would only report "No active exception to reraise".
    if pose not in ('NN', 'NT', 'TN', 'TT'):
        raise ValueError(
            "Unsupported layout P=%r; expected one of 'NN', 'NT', 'TN', 'TT'" % (pose,))

    a_transposed = pose[0] == 'T'
    b_transposed = pose[1] == 'T'

    A = tvm.placeholder((in_dim, batch) if a_transposed else (batch, in_dim),
                        name='A', dtype="float32")
    B = tvm.placeholder((out_dim, in_dim) if b_transposed else (in_dim, out_dim),
                        name='B', dtype="float32")
    k = tvm.reduce_axis((0, in_dim), name='k')

    def _product(i, j):
        # Index each operand according to its storage layout.
        lhs = A[k, i] if a_transposed else A[i, k]
        rhs = B[j, k] if b_transposed else B[k, j]
        return tvm.sum(lhs * rhs, axis=k)

    C = tvm.compute((batch, out_dim), _product, name='C')

    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)

    # One multiply and one add per (row, col, k) element.
    cfg.add_flop(batch * in_dim * out_dim * 2.0)

    # Stage both inputs through "shared" then "local" scope, and accumulate
    # the output in "local" scope.
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    y, x = C.op.axis
    k = CC.op.reduce_axis[0]

    # Knob 'X' tiles the row axis (y) and knob 'Y' tiles the column axis (x).
    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)

    ko, kt, ki = cfg['K'].apply(s, CC, k)

    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')

    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)

    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)

    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)

    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)

    # Shared-scope loads: fuse all axes, then split the fused axis by the
    # thread counts chosen for 'X' and 'Y' and vectorize the innermost 4.
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)

        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)

        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()

    return s, [A, B, C]
examples/trials/systems/opevo/tvm_patches/libcuda.so.1
0 → 100644
View file @
6de15707
File added
examples/trials/systems/opevo/tvm_patches/tvm_v0.6.patch
0 → 100644
View file @
6de15707
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 76d088f4c..7ed4ff02a 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -122,7 +122,7 @@
class Tuner(object):
configs = self.next_batch(min(n_parallel, n_trial - i))
inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
- results = measure_batch(inputs)
+ results = self.parse_configs(self.task, configs) if hasattr(self, 'parse_configs') else measure_batch(inputs)
# keep best config
for k, (inp, res) in enumerate(zip(inputs, results)):
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index eab542dd3..2f1a11303 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -808,6 +808,7 @@
void CodeGenC::VisitStmt_(const AttrStmt* op) {
IterVar iv = Downcast<IterVar>(op->node);
if (iv->thread_tag.length() != 0) {
if (!var_idmap_.count(iv->var.get())) {
+ this->currentOp = op;
BindThreadIndex(iv);
}
}
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 8701cda1e..7d3d56ddc 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -174,6 +174,8 @@
class CodeGenC :
// Get a cast type from to
virtual std::string CastFromTo(std::string value, Type from, Type target);
+ const AttrStmt* currentOp;
+
protected:
// Print reference to struct location
std::string GetStructRef(
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 6656fa077..a4f0f962d 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -106,6 +106,9 @@
void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
CHECK(!var_idmap_.count(iv->var.get()));
var_idmap_[iv->var.get()] =
CastFromTo(iv->thread_tag, UInt(32), iv->var.type());
+ int nthread = static_cast<int>(this->currentOp->value.as<IntImm>()->value);
+ if (iv->thread_tag.find("threadIdx.") == 0 || iv->thread_tag.find("blockIdx.") == 0)
+ this->stream << " // [thread_extent] " << iv->thread_tag << " = " << nthread << "\n";
}
void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*)
diff --git a/src/codegen/opt/build_cuda_on.cc b/src/codegen/opt/build_cuda_on.cc
index 1992ac5d9..9b0ff4cd9 100644
--- a/src/codegen/opt/build_cuda_on.cc
+++ b/src/codegen/opt/build_cuda_on.cc
@@ -137,6 +137,9 @@
runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
cg.AddFunction(f);
}
std::string code = cg.Finish();
+ const auto* backendproc = Registry::Get("tvm_callback_backend_proc");
+ if (backendproc)
+ return CUDAModuleCreate((*backendproc)(code).operator std::string(), "cubin", ExtractFuncInfo(funcs), code);
if (const auto* f = Registry::Get("tvm_callback_cuda_postproc")) {
code = (*f)(code).operator std::string();
diff --git a/src/lang/expr_operator.cc b/src/lang/expr_operator.cc
index 220d4378c..cc435d138 100644
--- a/src/lang/expr_operator.cc
+++ b/src/lang/expr_operator.cc
@@ -208,11 +208,11 @@
Expr operator%(Expr a, Expr b) {
// TODO(tqchen): switch to floordiv
Expr indexdiv(Expr a, Expr b) {
- return floordiv(a, b);
+ return truncdiv(a, b);
}
Expr indexmod(Expr a, Expr b) {
- return floormod(a, b);
+ return truncmod(a, b);
}
Expr floordiv(Expr a, Expr b) {
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment