Unverified commit 6de15707, authored by gxiaotian and committed by GitHub

Add OpEvo example (#2549)

parent 25c4c3b5
authorName: default
experimentName: MatMul_N512K1024M1024_NA2C
trialConcurrency: 6
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: na2c.py
  className: N_A2C
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
trial:
  command: OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K1024M1024_OPEVO
trialConcurrency: 8
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: opevo.py
  className: OpEvo
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    parents_size: 8
    offspring_size: 8
    mutate_rate: 0.5
trial:
  command: OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
{"K": {"_type": "factor", "_value": [1024, 3]}, "X": {"_type": "factor", "_value": [512, 4]}, "Y": {"_type": "factor", "_value": [1024, 4]}}
authorName: default
experimentName: MatMul_N512K1024M4096_GBFS
trialConcurrency: 5
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: gbfs.py
  className: G_BFS
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    num_samples: 5
trial:
  command: OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K1024M4096_NA2C
trialConcurrency: 6
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: na2c.py
  className: N_A2C
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
trial:
  command: OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K1024M4096_OPEVO
trialConcurrency: 8
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: opevo.py
  className: OpEvo
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    parents_size: 8
    offspring_size: 8
    mutate_rate: 0.5
trial:
  command: OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
{"K": {"_type": "factor", "_value": [1024, 3]}, "X": {"_type": "factor", "_value": [512, 4]}, "Y": {"_type": "factor", "_value": [4096, 4]}}
authorName: default
experimentName: MatMul_N512K4096M1024_GBFS
trialConcurrency: 5
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: gbfs.py
  className: G_BFS
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    num_samples: 5
trial:
  command: OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K4096M1024_NA2C
trialConcurrency: 6
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: na2c.py
  className: N_A2C
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
trial:
  command: OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K4096M1024_OPEVO
trialConcurrency: 8
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: opevo.py
  className: OpEvo
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    parents_size: 8
    offspring_size: 8
    mutate_rate: 0.5
trial:
  command: OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
{"K": {"_type": "factor", "_value": [4096, 3]}, "X": {"_type": "factor", "_value": [512, 4]}, "Y": {"_type": "factor", "_value": [1024, 4]}}
#!/bin/bash -e
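# Entry point for a single tuning trial: selects the backend, sets up the TVM
# environment, and runs the auto-tuning driver for the operator chosen via OP.
# Typical invocation (as used by the trial commands in the configs above):
#   OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh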
cd $(dirname $0)
export BACKEND=${BACKEND:-c-cuda}
if [[ "${BACKEND}" == "c-cuda" ]]; then
export BACKEND="#cuda"
fi
if [[ "${BACKEND}" != "#cuda" ]]; then
export LD_LIBRARY_PATH=/opt/tvm/build
else
export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
fi
export HIP_PLATFORM=hcc
export HSA_USERPTR_FOR_PAGED_MEM=0
export PYTHONDONTWRITEBYTECODE=1
export PYTHONPATH=/opt/tvm/python:/opt/tvm/topi/python:/opt/tvm/nnvm/python:/usr/local/rocm/src
ldconfig
time OP=${OP:-matmul} S=${S:-0} python3 ./compiler_auto_tune_stable.py "$@"
import numpy as np
import tvm
import logging
import sys, time, subprocess
from tvm import autotvm
import topi
import json
from topi.util import get_const_tuple
import os
op_attributes = {
"B": int(os.environ['B']) if 'B' in os.environ else 6,
"N": int(os.environ['N']) if 'N' in os.environ else 1024,
"K": int(os.environ['K']) if 'K' in os.environ else 64,
"M": int(os.environ['M']) if 'M' in os.environ else 4096,
"P": os.environ['P'] if 'P' in os.environ else "NN",
}
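# Batched matrix multiplication template: B batches of an (N x K) x (K x M)
# product, with the transpose combination of the two operands selected by P.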
@autotvm.template
def get_template_op(**kargs):
    batch = op_attributes["B"]
    M = op_attributes["N"]
    K = op_attributes["K"]
    N = op_attributes["M"]
    pose = op_attributes["P"]
    if pose == 'NN':
        A = tvm.placeholder((batch, M, K), name='A', dtype="float32")
        B = tvm.placeholder((batch, K, N), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, i, k] * B[b, k, j], axis=k), name='C')
    elif pose == 'NT':
        A = tvm.placeholder((batch, M, K), name='A', dtype="float32")
        B = tvm.placeholder((batch, N, K), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, i, k] * B[b, j, k], axis=k), name='C')
    elif pose == 'TN':
        A = tvm.placeholder((batch, K, M), name='A', dtype="float32")
        B = tvm.placeholder((batch, K, N), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, k, i] * B[b, k, j], axis=k), name='C')
    elif pose == 'TT':
        A = tvm.placeholder((batch, K, M), name='A', dtype="float32")
        B = tvm.placeholder((batch, N, K), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, k, i] * B[b, j, k], axis=k), name='C')
    else:
        raise ValueError('unsupported transpose mode: %s' % pose)
    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
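    # Stage A and B through shared and local memory and accumulate C in registers.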
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")
    b, y, x = C.op.axis
    k = CC.op.reduce_axis[0]
    cfg.define_split('B', cfg.axis(b), num_outputs=2)
    bo, bi = cfg['B'].apply(s, C, b)
    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    ko, kt, ki = cfg['K'].apply(s, CC, k)
    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    block_z = tvm.thread_axis('blockIdx.z')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')
    thread_z = tvm.thread_axis('threadIdx.z')
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)
    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)
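    # Bind the tiled output axes to the GPU hierarchy: batch and outer tiles to
    # blocks, middle tiles to virtual threads, inner tiles to threads.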
    s[C].bind(bo, block_z)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(bi, thread_z)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)
    bo, yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)
    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
        s[stage].double_buffer()
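    # Cooperatively fetch the shared-memory tiles of A and B with all threads
    # and vectorize the innermost copy.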
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)
        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)
        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()
    cfg.add_flop(batch * M * K * N * 2.0)
    return s, [A, B, C]
import numpy as np
import tvm
import logging
import sys, time, subprocess
from tvm import autotvm
import topi
import json
from topi.util import get_const_tuple
import os
op_attributes = {
"N": int(os.environ['N']) if 'N' in os.environ else 64,
"C": int(os.environ['C']) if 'C' in os.environ else 3,
"H": int(os.environ['H']) if 'H' in os.environ else 229,
"W": int(os.environ['W']) if 'W' in os.environ else 229,
"F": int(os.environ['F']) if 'F' in os.environ else 32,
"K": int(os.environ['K']) if 'K' in os.environ else 5,
"ST": int(os.environ['ST']) if 'ST' in os.environ else 1,
"PD": int(os.environ['PD']) if 'PD' in os.environ else 2,
}
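# 2D convolution template (NCHW layout): N images with C input channels of size
# H x W, convolved with F filters of size K x K, using stride ST and padding PD.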
@autotvm.template
def get_template_op(**kargs):
    N = op_attributes["N"]
    CI = op_attributes["C"]
    H = op_attributes["H"]
    W = op_attributes["W"]
    CO = op_attributes["F"]
    KH = KW = op_attributes["K"]
    stride = op_attributes["ST"]
    padding = op_attributes["PD"]
    dilation = 1
    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    conv = topi.nn.conv2d_nchw(
        data, kernel, (stride, stride), (padding, padding), dilation=1, out_dtype='float32')
    s = tvm.create_schedule([conv.op])
    cfg = autotvm.get_config()
    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_y", y, num_outputs=4)
    cfg.define_split("tile_x", x, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=2)
    cfg.define_split("tile_ry", ry, num_outputs=2)
    cfg.define_split("tile_rx", rx, num_outputs=2)
    cfg.define_knob("auto_unroll_max_step", [0, 125, 256])
    target = tvm.target.current_target()
    if target.target_name in ['nvptx', 'rocm']:
        cfg.define_knob("unroll_explicit", [1])
    else:
        cfg.define_knob("unroll_explicit", [0, 1])
    pad_data, kernel = s[conv].op.input_tensors
    s[pad_data].compute_inline()
    if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
        s[kernel].compute_inline()
    if conv.op in s.outputs:
        output = conv
        OL = s.cache_write(conv, 'local')
    else:
        output = s.outputs[0].output(0)
        s[conv].set_scope('local')
        OL = conv
    # create cache stage
    AA = s.cache_read(pad_data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    kernel_scope, n = s[output].split(n, nparts=1)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    bf = s[output].fuse(n, bf)
    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)
    # tile reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
    # unroll
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
    N, CO, OH, OW = get_const_tuple(output.shape)
    _, CI, KH, KW = get_const_tuple(kernel.shape)
    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
    return s, [data, kernel, conv]
import numpy as np
import tvm
import logging
import sys, time, subprocess
from tvm import autotvm
import topi
import json
from topi.util import get_const_tuple
import os
op_attributes = {
"N": int(os.environ['N']) if 'N' in os.environ else 1024,
"K": int(os.environ['K']) if 'K' in os.environ else 64,
"M": int(os.environ['M']) if 'M' in os.environ else 4096,
"P": os.environ['P'] if 'P' in os.environ else "NN",
}
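# Dense matrix multiplication template: an (N x K) x (K x M) product, with the
# transpose combination of the two operands selected by P.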
@autotvm.template
def get_template_op(**kargs):
    batch = op_attributes["N"]
    in_dim = op_attributes["K"]
    out_dim = op_attributes["M"]
    pose = op_attributes["P"]
    if pose == 'NN':
        A = tvm.placeholder((batch, in_dim), name='A', dtype="float32")
        B = tvm.placeholder((in_dim, out_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[i, k] * B[k, j], axis=k), name='C')
    elif pose == 'NT':
        A = tvm.placeholder((batch, in_dim), name='A', dtype="float32")
        B = tvm.placeholder((out_dim, in_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[i, k] * B[j, k], axis=k), name='C')
    elif pose == 'TN':
        A = tvm.placeholder((in_dim, batch), name='A', dtype="float32")
        B = tvm.placeholder((in_dim, out_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[k, i] * B[k, j], axis=k), name='C')
    elif pose == 'TT':
        A = tvm.placeholder((in_dim, batch), name='A', dtype="float32")
        B = tvm.placeholder((out_dim, in_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[k, i] * B[j, k], axis=k), name='C')
    else:
        raise ValueError('unsupported transpose mode: %s' % pose)
    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
    cfg.add_flop(batch * in_dim * out_dim * 2.0)
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")
    y, x = C.op.axis
    k = CC.op.reduce_axis[0]
    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)
    ko, kt, ki = cfg['K'].apply(s, CC, k)
    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')
    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)
    yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)
    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)
        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)
        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()
    return s, [A, B, C]
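The driver that consumes these templates (compiler_auto_tune_stable.py, invoked by run.sh) is not shown in this commit. Purely for orientation, and not as the actual driver code or the tuners used here (G_BFS, N_A2C, and OpEvo run through NNI against the patched tuner.py below), a @autotvm.template function of this kind is typically turned into a tuning task and measured with the TVM 0.6-era autotvm API roughly as follows; the task, tuner, and log-file names are illustrative:

import tvm
from tvm import autotvm

# Hypothetical sketch only: build a tuning task from the decorated template,
# measure candidate schedules locally, and log the results.
task = autotvm.task.create(get_template_op, args=(), target='cuda')
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=5, repeat=1, timeout=20))
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul_tuning.log')])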
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 76d088f4c..7ed4ff02a 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -122,7 +122,7 @@ class Tuner(object):
configs = self.next_batch(min(n_parallel, n_trial - i))
inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
- results = measure_batch(inputs)
+ results = self.parse_configs(self.task, configs) if hasattr(self, 'parse_configs') else measure_batch(inputs)
# keep best config
for k, (inp, res) in enumerate(zip(inputs, results)):
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index eab542dd3..2f1a11303 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -808,6 +808,7 @@ void CodeGenC::VisitStmt_(const AttrStmt* op) {
IterVar iv = Downcast<IterVar>(op->node);
if (iv->thread_tag.length() != 0) {
if (!var_idmap_.count(iv->var.get())) {
+ this->currentOp = op;
BindThreadIndex(iv);
}
}
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 8701cda1e..7d3d56ddc 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -174,6 +174,8 @@ class CodeGenC :
// Get a cast type from to
virtual std::string CastFromTo(std::string value, Type from, Type target);
+ const AttrStmt* currentOp;
+
protected:
// Print reference to struct location
std::string GetStructRef(
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 6656fa077..a4f0f962d 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -106,6 +106,9 @@ void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
CHECK(!var_idmap_.count(iv->var.get()));
var_idmap_[iv->var.get()] =
CastFromTo(iv->thread_tag, UInt(32), iv->var.type());
+ int nthread = static_cast<int>(this->currentOp->value.as<IntImm>()->value);
+ if (iv->thread_tag.find("threadIdx.") == 0 || iv->thread_tag.find("blockIdx.") == 0)
+ this->stream << " // [thread_extent] " << iv->thread_tag << " = " << nthread << "\n";
}
void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*)
diff --git a/src/codegen/opt/build_cuda_on.cc b/src/codegen/opt/build_cuda_on.cc
index 1992ac5d9..9b0ff4cd9 100644
--- a/src/codegen/opt/build_cuda_on.cc
+++ b/src/codegen/opt/build_cuda_on.cc
@@ -137,6 +137,9 @@ runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
cg.AddFunction(f);
}
std::string code = cg.Finish();
+ const auto* backendproc = Registry::Get("tvm_callback_backend_proc");
+ if (backendproc)
+ return CUDAModuleCreate((*backendproc)(code).operator std::string(), "cubin", ExtractFuncInfo(funcs), code);
if (const auto* f = Registry::Get("tvm_callback_cuda_postproc")) {
code = (*f)(code).operator std::string();
diff --git a/src/lang/expr_operator.cc b/src/lang/expr_operator.cc
index 220d4378c..cc435d138 100644
--- a/src/lang/expr_operator.cc
+++ b/src/lang/expr_operator.cc
@@ -208,11 +208,11 @@ Expr operator%(Expr a, Expr b) {
// TODO(tqchen): switch to floordiv
Expr indexdiv(Expr a, Expr b) {
- return floordiv(a, b);
+ return truncdiv(a, b);
}
Expr indexmod(Expr a, Expr b) {
- return floormod(a, b);
+ return truncmod(a, b);
}
Expr floordiv(Expr a, Expr b) {