Unverified commit 6de15707, authored by gxiaotian and committed by GitHub

Add OpEvo example (#2549)

parent 25c4c3b5
authorName: default
experimentName: MatMul_N512K1024M1024_NA2C
trialConcurrency: 6
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: na2c.py
  className: N_A2C
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
trial:
  command: OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K1024M1024_OPEVO
trialConcurrency: 8
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: opevo.py
  className: OpEvo
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    parents_size: 8
    offspring_size: 8
    mutate_rate: 0.5
trial:
  command: OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
{"K": {"_type": "factor", "_value": [1024, 3]}, "X": {"_type": "factor", "_value": [512, 4]}, "Y": {"_type": "factor", "_value": [1024, 4]}}
authorName: default
experimentName: MatMul_N512K1024M4096_GBFS
trialConcurrency: 5
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: gbfs.py
  className: G_BFS
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    num_samples: 5
trial:
  command: OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K1024M4096_NA2C
trialConcurrency: 6
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: na2c.py
  className: N_A2C
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
trial:
  command: OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K1024M4096_OPEVO
trialConcurrency: 8
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: opevo.py
  className: OpEvo
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    parents_size: 8
    offspring_size: 8
    mutate_rate: 0.5
trial:
  command: OP=matmul N=512 K=1024 M=4096 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
{"K": {"_type": "factor", "_value": [1024, 3]}, "X": {"_type": "factor", "_value": [512, 4]}, "Y": {"_type": "factor", "_value": [4096, 4]}}
authorName: default
experimentName: MatMul_N512K4096M1024_GBFS
trialConcurrency: 5
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: gbfs.py
  className: G_BFS
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    num_samples: 5
trial:
  command: OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K4096M1024_NA2C
trialConcurrency: 6
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: na2c.py
  className: N_A2C
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
trial:
  command: OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
authorName: default
experimentName: MatMul_N512K4096M1024_OPEVO
trialConcurrency: 8
maxExecDuration: 24h
maxTrialNum: 512
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  codeDir: /root/algorithms/
  classFileName: opevo.py
  className: OpEvo
  # Any parameter that needs to be passed to your tuner's __init__ constructor
  # can be specified in this optional classArgs field, for example:
  classArgs:
    optimize_mode: maximize
    parents_size: 8
    offspring_size: 8
    mutate_rate: 0.5
trial:
  command: OP=matmul N=512 K=4096 M=1024 P=NN ./run.sh
  codeDir: /root
  # gpuNum: 0
{"K": {"_type": "factor", "_value": [4096, 3]}, "X": {"_type": "factor", "_value": [512, 4]}, "Y": {"_type": "factor", "_value": [1024, 4]}}
#!/bin/bash -e
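# Entry point for a single tuning trial: selects the backend, sets up the TVM
# environment, and runs the auto-tuning driver for the operator chosen via OP.
# Typical invocation (as used by the trial commands in the configs above):
#   OP=matmul N=512 K=1024 M=1024 P=NN ./run.sh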
cd $(dirname $0)
export BACKEND=${BACKEND:-c-cuda}
if [[ "${BACKEND}" == "c-cuda" ]]; then
export BACKEND="#cuda"
fi
if [[ "${BACKEND}" != "#cuda" ]]; then
export LD_LIBRARY_PATH=/opt/tvm/build
else
export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
fi
export HIP_PLATFORM=hcc
export HSA_USERPTR_FOR_PAGED_MEM=0
export PYTHONDONTWRITEBYTECODE=1
export PYTHONPATH=/opt/tvm/python:/opt/tvm/topi/python:/opt/tvm/nnvm/python:/usr/local/rocm/src
ldconfig
time OP=${OP:-matmul} S=${S:-0} python3 ./compiler_auto_tune_stable.py "$@"
import numpy as np
import tvm
import logging
import sys, time, subprocess
from tvm import autotvm
import topi
import json
from topi.util import get_const_tuple
import os
op_attributes = {
"B": int(os.environ['B']) if 'B' in os.environ else 6,
"N": int(os.environ['N']) if 'N' in os.environ else 1024,
"K": int(os.environ['K']) if 'K' in os.environ else 64,
"M": int(os.environ['M']) if 'M' in os.environ else 4096,
"P": os.environ['P'] if 'P' in os.environ else "NN",
}
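# Batched matrix multiplication template: B batches of an (N x K) x (K x M)
# product, with the transpose combination of the two operands selected by P.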
@autotvm.template
def get_template_op(**kargs):
    batch = op_attributes["B"]
    M = op_attributes["N"]
    K = op_attributes["K"]
    N = op_attributes["M"]
    pose = op_attributes["P"]
    if pose == 'NN':
        A = tvm.placeholder((batch, M, K), name='A', dtype="float32")
        B = tvm.placeholder((batch, K, N), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, i, k] * B[b, k, j], axis=k), name='C')
    elif pose == 'NT':
        A = tvm.placeholder((batch, M, K), name='A', dtype="float32")
        B = tvm.placeholder((batch, N, K), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, i, k] * B[b, j, k], axis=k), name='C')
    elif pose == 'TN':
        A = tvm.placeholder((batch, K, M), name='A', dtype="float32")
        B = tvm.placeholder((batch, K, N), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, k, i] * B[b, k, j], axis=k), name='C')
    elif pose == 'TT':
        A = tvm.placeholder((batch, K, M), name='A', dtype="float32")
        B = tvm.placeholder((batch, N, K), name='B', dtype="float32")
        k = tvm.reduce_axis((0, K), name='k')
        C = tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(
            A[b, k, i] * B[b, j, k], axis=k), name='C')
    else:
        raise ValueError('unsupported transpose mode: %s' % pose)
    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
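    # Stage A and B through shared and local memory and accumulate C in registers.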
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")
    b, y, x = C.op.axis
    k = CC.op.reduce_axis[0]
    cfg.define_split('B', cfg.axis(b), num_outputs=2)
    bo, bi = cfg['B'].apply(s, C, b)
    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    ko, kt, ki = cfg['K'].apply(s, CC, k)
    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    block_z = tvm.thread_axis('blockIdx.z')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')
    thread_z = tvm.thread_axis('threadIdx.z')
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)
    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)
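    # Bind the tiled output axes to the GPU hierarchy: batch and outer tiles to
    # blocks, middle tiles to virtual threads, inner tiles to threads.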
    s[C].bind(bo, block_z)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(bi, thread_z)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)
    bo, yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)
    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
        s[stage].double_buffer()
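    # Cooperatively fetch the shared-memory tiles of A and B with all threads
    # and vectorize the innermost copy.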
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)
        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)
        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()
    cfg.add_flop(batch * M * K * N * 2.0)
    return s, [A, B, C]
import numpy as np
import tvm
import logging
import sys, time, subprocess
from tvm import autotvm
import topi
import json
from topi.util import get_const_tuple
import os
op_attributes = {
"N": int(os.environ['N']) if 'N' in os.environ else 64,
"C": int(os.environ['C']) if 'C' in os.environ else 3,
"H": int(os.environ['H']) if 'H' in os.environ else 229,
"W": int(os.environ['W']) if 'W' in os.environ else 229,
"F": int(os.environ['F']) if 'F' in os.environ else 32,
"K": int(os.environ['K']) if 'K' in os.environ else 5,
"ST": int(os.environ['ST']) if 'ST' in os.environ else 1,
"PD": int(os.environ['PD']) if 'PD' in os.environ else 2,
}
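# 2D convolution template (NCHW layout): N images with C input channels of size
# H x W, convolved with F filters of size K x K, using stride ST and padding PD.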
@autotvm.template
def get_template_op(**kargs):
    N = op_attributes["N"]
    CI = op_attributes["C"]
    H = op_attributes["H"]
    W = op_attributes["W"]
    CO = op_attributes["F"]
    KH = KW = op_attributes["K"]
    stride = op_attributes["ST"]
    padding = op_attributes["PD"]
    dilation = 1
    data = tvm.placeholder((N, CI, H, W), name='data')
    kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
    conv = topi.nn.conv2d_nchw(
        data, kernel, (stride, stride), (padding, padding), dilation=1, out_dtype='float32')
    s = tvm.create_schedule([conv.op])
    cfg = autotvm.get_config()
    ##### space definition begin #####
    n, f, y, x = s[conv].op.axis
    rc, ry, rx = s[conv].op.reduce_axis
    cfg.define_split("tile_f", f, num_outputs=4)
    cfg.define_split("tile_y", y, num_outputs=4)
    cfg.define_split("tile_x", x, num_outputs=4)
    cfg.define_split("tile_rc", rc, num_outputs=2)
    cfg.define_split("tile_ry", ry, num_outputs=2)
    cfg.define_split("tile_rx", rx, num_outputs=2)
    cfg.define_knob("auto_unroll_max_step", [0, 125, 256])
    target = tvm.target.current_target()
    if target.target_name in ['nvptx', 'rocm']:
        cfg.define_knob("unroll_explicit", [1])
    else:
        cfg.define_knob("unroll_explicit", [0, 1])
    pad_data, kernel = s[conv].op.input_tensors
    s[pad_data].compute_inline()
    if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
        s[kernel].compute_inline()
    if conv.op in s.outputs:
        output = conv
        OL = s.cache_write(conv, 'local')
    else:
        output = s.outputs[0].output(0)
        s[conv].set_scope('local')
        OL = conv
    # create cache stage
    AA = s.cache_read(pad_data, 'shared', [OL])
    WW = s.cache_read(kernel, 'shared', [OL])
    # tile and bind spatial axes
    n, f, y, x = s[output].op.axis
    kernel_scope, n = s[output].split(n, nparts=1)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
    bf = s[output].fuse(n, bf)
    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[output].bind(vf, tvm.thread_axis("vthread"))
    s[output].bind(vy, tvm.thread_axis("vthread"))
    s[output].bind(vx, tvm.thread_axis("vthread"))
    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
    s[OL].compute_at(s[output], tx)
    # tile reduction axes
    n, f, y, x = s[OL].op.axis
    rc, ry, rx = s[OL].op.reduce_axis
    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
    ryo, ryi = cfg['tile_ry'].apply(s, OL, ry)
    rxo, rxi = cfg['tile_rx'].apply(s, OL, rx)
    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
    s[AA].compute_at(s[OL], rxo)
    s[WW].compute_at(s[OL], rxo)
    # cooperative fetching
    for load in [AA, WW]:
        n, f, y, x = s[load].op.axis
        fused = s[load].fuse(n, f, y, x)
        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
    # unroll
    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
    N, CO, OH, OW = get_const_tuple(output.shape)
    _, CI, KH, KW = get_const_tuple(kernel.shape)
    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
    return s, [data, kernel, conv]
import numpy as np
import tvm
import logging
import sys, time, subprocess
from tvm import autotvm
import topi
import json
from topi.util import get_const_tuple
import os
op_attributes = {
"N": int(os.environ['N']) if 'N' in os.environ else 1024,
"K": int(os.environ['K']) if 'K' in os.environ else 64,
"M": int(os.environ['M']) if 'M' in os.environ else 4096,
"P": os.environ['P'] if 'P' in os.environ else "NN",
}
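# Dense matrix multiplication template: an (N x K) x (K x M) product, with the
# transpose combination of the two operands selected by P.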
@autotvm.template
def get_template_op(**kargs):
    batch = op_attributes["N"]
    in_dim = op_attributes["K"]
    out_dim = op_attributes["M"]
    pose = op_attributes["P"]
    if pose == 'NN':
        A = tvm.placeholder((batch, in_dim), name='A', dtype="float32")
        B = tvm.placeholder((in_dim, out_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[i, k] * B[k, j], axis=k), name='C')
    elif pose == 'NT':
        A = tvm.placeholder((batch, in_dim), name='A', dtype="float32")
        B = tvm.placeholder((out_dim, in_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[i, k] * B[j, k], axis=k), name='C')
    elif pose == 'TN':
        A = tvm.placeholder((in_dim, batch), name='A', dtype="float32")
        B = tvm.placeholder((in_dim, out_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[k, i] * B[k, j], axis=k), name='C')
    elif pose == 'TT':
        A = tvm.placeholder((in_dim, batch), name='A', dtype="float32")
        B = tvm.placeholder((out_dim, in_dim), name='B', dtype="float32")
        k = tvm.reduce_axis((0, in_dim), name='k')
        C = tvm.compute((batch, out_dim), lambda i, j: tvm.sum(
            A[k, i] * B[j, k], axis=k), name='C')
    else:
        raise ValueError('unsupported transpose mode: %s' % pose)
    cfg = autotvm.get_config()
    s = tvm.create_schedule(C.op)
    cfg.add_flop(batch * in_dim * out_dim * 2.0)
    AA = s.cache_read(A, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BB = s.cache_read(B, "shared", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")
    y, x = C.op.axis
    k = CC.op.reduce_axis[0]
    cfg.define_split('K', cfg.axis(k), num_outputs=3)
    cfg.define_split('X', cfg.axis(y), num_outputs=4)
    cfg.define_split('Y', cfg.axis(x), num_outputs=4)
    ko, kt, ki = cfg['K'].apply(s, CC, k)
    block_x = tvm.thread_axis('blockIdx.x')
    block_y = tvm.thread_axis('blockIdx.y')
    thread_x = tvm.thread_axis('threadIdx.x')
    thread_y = tvm.thread_axis('threadIdx.y')
    by, tyz, ty, yi = cfg['X'].apply(s, C, y)
    bx, txz, tx, xi = cfg['Y'].apply(s, C, x)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, tvm.thread_axis('vthread'))
    s[C].bind(txz, tvm.thread_axis('vthread'))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)
    yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)
    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
    for stage in [AA, BB]:
        s[stage].compute_at(s[CC], ko)
        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg['X'].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg['Y'].size[2])
        _, xi = s[stage].split(xi, factor=4)
        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)
        s[stage].double_buffer()
    return s, [A, B, C]
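The driver that consumes these templates (compiler_auto_tune_stable.py, invoked by run.sh) is not shown in this commit. Purely for orientation, and not as the actual driver code or the tuners used here (G_BFS, N_A2C, and OpEvo run through NNI against the patched tuner.py below), a @autotvm.template function of this kind is typically turned into a tuning task and measured with the TVM 0.6-era autotvm API roughly as follows; the task, tuner, and log-file names are illustrative:

import tvm
from tvm import autotvm

# Hypothetical sketch only: build a tuning task from the decorated template,
# measure candidate schedules locally, and log the results.
task = autotvm.task.create(get_template_op, args=(), target='cuda')
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=5, repeat=1, timeout=20))
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file('matmul_tuning.log')])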
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 76d088f4c..7ed4ff02a 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -122,7 +122,7 @@ class Tuner(object):
configs = self.next_batch(min(n_parallel, n_trial - i))
inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
- results = measure_batch(inputs)
+ results = self.parse_configs(self.task, configs) if hasattr(self, 'parse_configs') else measure_batch(inputs)
# keep best config
for k, (inp, res) in enumerate(zip(inputs, results)):
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index eab542dd3..2f1a11303 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -808,6 +808,7 @@ void CodeGenC::VisitStmt_(const AttrStmt* op) {
IterVar iv = Downcast<IterVar>(op->node);
if (iv->thread_tag.length() != 0) {
if (!var_idmap_.count(iv->var.get())) {
+ this->currentOp = op;
BindThreadIndex(iv);
}
}
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 8701cda1e..7d3d56ddc 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -174,6 +174,8 @@ class CodeGenC :
// Get a cast type from to
virtual std::string CastFromTo(std::string value, Type from, Type target);
+ const AttrStmt* currentOp;
+
protected:
// Print reference to struct location
std::string GetStructRef(
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 6656fa077..a4f0f962d 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -106,6 +106,9 @@ void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
CHECK(!var_idmap_.count(iv->var.get()));
var_idmap_[iv->var.get()] =
CastFromTo(iv->thread_tag, UInt(32), iv->var.type());
+ int nthread = static_cast<int>(this->currentOp->value.as<IntImm>()->value);
+ if (iv->thread_tag.find("threadIdx.") == 0 || iv->thread_tag.find("blockIdx.") == 0)
+ this->stream << " // [thread_extent] " << iv->thread_tag << " = " << nthread << "\n";
}
void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*)
diff --git a/src/codegen/opt/build_cuda_on.cc b/src/codegen/opt/build_cuda_on.cc
index 1992ac5d9..9b0ff4cd9 100644
--- a/src/codegen/opt/build_cuda_on.cc
+++ b/src/codegen/opt/build_cuda_on.cc
@@ -137,6 +137,9 @@ runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
cg.AddFunction(f);
}
std::string code = cg.Finish();
+ const auto* backendproc = Registry::Get("tvm_callback_backend_proc");
+ if (backendproc)
+ return CUDAModuleCreate((*backendproc)(code).operator std::string(), "cubin", ExtractFuncInfo(funcs), code);
if (const auto* f = Registry::Get("tvm_callback_cuda_postproc")) {
code = (*f)(code).operator std::string();
diff --git a/src/lang/expr_operator.cc b/src/lang/expr_operator.cc
index 220d4378c..cc435d138 100644
--- a/src/lang/expr_operator.cc
+++ b/src/lang/expr_operator.cc
@@ -208,11 +208,11 @@ Expr operator%(Expr a, Expr b) {
// TODO(tqchen): switch to floordiv
Expr indexdiv(Expr a, Expr b) {
- return floordiv(a, b);
+ return truncdiv(a, b);
}
Expr indexmod(Expr a, Expr b) {
- return floormod(a, b);
+ return truncmod(a, b);
}
Expr floordiv(Expr a, Expr b) {