"git@developer.sourcefind.cn:OpenDAS/bitsandbytes.git" did not exist on "09ea8618ff6d9cfe2a8382f867768e875f6f7b9c"
Unverified commit 4208ce2b authored by Zhi Lin, committed by GitHub

[Feature] Tvm integration (#2367)


Co-authored-by: Zihao Ye <expye@outlook.com>
parent 035f1ae3
@@ -26,3 +26,6 @@
[submodule "third_party/xbyak"]
path = third_party/xbyak
url = https://github.com/herumi/xbyak
[submodule "third_party/tvm"]
path = third_party/tvm
url = https://github.com/apache/incubator-tvm
@@ -25,6 +25,7 @@ endif()
dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" ON)
dgl_option(USE_TVM "Build with TVM kernels" OFF)
dgl_option(BUILD_CPP_TEST "Build cpp unittest executables" OFF)
dgl_option(LIBCXX_ENABLE_PARALLEL_ALGORITHMS "Enable the parallel algorithms library. This requires the PSTL to be available." OFF)
dgl_option(USE_S3 "Build with S3 support" OFF)
@@ -52,17 +53,6 @@ if(USE_CUDA)
endif()
endif(USE_CUDA)
# include directories
include_directories("include")
include_directories("third_party/dlpack/include")
include_directories("third_party/METIS/include/")
include_directories("third_party/dmlc-core/include")
include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/phmap/")
include_directories("third_party/xbyak/")
include_directories("tensoradapter/include")
# initial variables
if(NOT MSVC)
set(DGL_LINKER_LIBS "dl")
@@ -165,6 +155,17 @@ else(USE_CUDA)
add_library(dgl SHARED ${DGL_SRC})
endif(USE_CUDA)
# include directories
target_include_directories(dgl PRIVATE "include")
target_include_directories(dgl PRIVATE "third_party/dlpack/include")
target_include_directories(dgl PRIVATE "third_party/dmlc-core/include")
target_include_directories(dgl PRIVATE "third_party/minigun/minigun")
target_include_directories(dgl PRIVATE "third_party/minigun/third_party/moderngpu/src")
target_include_directories(dgl PRIVATE "third_party/phmap/")
target_include_directories(dgl PRIVATE "third_party/xbyak/")
target_include_directories(dgl PRIVATE "third_party/METIS/include/")
target_include_directories(dgl PRIVATE "tensoradapter/include")
# For serialization
if (USE_HDFS)
option(DMLC_HDFS_SHARED "dgl has to build with dynamic hdfs library" ON)
@@ -178,10 +179,23 @@ if(NOT MSVC)
set(GKLIB_PATH "${CMAKE_SOURCE_DIR}/third_party/METIS/GKlib")
include(${GKLIB_PATH}/GKlibSystem.cmake)
include_directories(${GKLIB_PATH})
include_directories("third_party/METIS/include/")
add_subdirectory("third_party/METIS/libmetis/") add_subdirectory("third_party/METIS/libmetis/")
list(APPEND DGL_LINKER_LIBS metis) list(APPEND DGL_LINKER_LIBS metis)
endif(NOT MSVC) endif(NOT MSVC)
# Compile TVM Runtime and FeatGraph
# (NOTE) We compile a dynamic library called featgraph_runtime, which the DGL library links to.
# Kernels are packed in a separate dynamic library called featgraph_kernels, which DGL
# loads at runtime.
if(USE_TVM)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_TVM")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_TVM")
target_include_directories(dgl PRIVATE "featgraph/include")
add_subdirectory("featgraph/")
list(APPEND DGL_LINKER_LIBS featgraph_runtime)
endif(USE_TVM)
# support PARALLEL_ALGORITHMS
if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
add_definitions(-DPARALLEL_ALGORITHMS)
@@ -238,6 +252,10 @@ if(BUILD_CPP_TEST)
add_subdirectory(./third_party/googletest)
enable_testing()
include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
include_directories("include")
include_directories("third_party/dlpack/include")
include_directories("third_party/xbyak")
include_directories("third_party/dmlc-core/include")
file(GLOB_RECURSE TEST_SRC_FILES ${PROJECT_SOURCE_DIR}/tests/cpp/*.cc)
add_executable(runUnitTests ${TEST_SRC_FILES})
target_link_libraries(runUnitTests gtest gtest_main)
...
@@ -21,6 +21,7 @@ Contributors
* [Gongze Cao](https://github.com/Zardinality): Cluster GCN
* [Yicheng Wu](https://github.com/MilkshakeForReal): RotatE in PyTorch
* [Hao Xiong](https://github.com/ShawXh): DeepWalk in PyTorch
* [Zhi Lin](https://github.com/kira-lin): Integrate FeatGraph into DGL
Other improvement
* [Brett Koonce](https://github.com/brettkoonce)
...
@@ -34,14 +34,18 @@ set(USE_CUDA OFF)
#---------------------------------------------
# Misc.
#---------------------------------------------
# Whether to build cpp unittest executables.
set(BUILD_CPP_TEST OFF)
# Whether to enable OpenMP.
set(USE_OPENMP ON)
# Whether to enable Intel's avx optimized kernel.
set(USE_AVX ON)
# Whether to build PyTorch plugins.
set(BUILD_TORCH ON)
# Whether to enable CUDA kernels compiled with TVM.
set(USE_TVM OFF)
cmake_minimum_required(VERSION 3.5)
project(featgraph C CXX)
message(STATUS "Start configuring project ${PROJECT_NAME}")
# Find CUDA
include(../cmake/util/FindCUDA.cmake)
find_cuda(ON)
message(STATUS "${CUDA_INCLUDE_DIRS}")
add_custom_target(
featgraph_kernel
COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/pack_featgraph.py
COMMENT "Creating featgraph kernels..."
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O2 -fPIC")
file(GLOB FEATGRAPH_SRC
src/featgraph.cc
src/tvm_runtime_pack.cc
)
add_library(featgraph_runtime SHARED ${FEATGRAPH_SRC})
target_include_directories(featgraph_runtime PRIVATE ${CUDA_INCLUDE_DIRS})
target_include_directories(featgraph_runtime PRIVATE "./include")
target_include_directories(featgraph_runtime PRIVATE "../third_party/tvm/include")
target_include_directories(featgraph_runtime PRIVATE "../third_party/tvm/3rdparty/dmlc-core/include")
target_include_directories(featgraph_runtime PRIVATE "../third_party/tvm/3rdparty/dlpack/include")
target_link_libraries(featgraph_runtime "dl" # dynamic linking
${CUDA_CUDART_LIBRARY}
${CUDA_CUDA_LIBRARY}
${CUDA_NVRTC_LIBRARY})
add_dependencies(featgraph_runtime featgraph_kernel)
install(TARGETS featgraph_runtime LIBRARY DESTINATION lib)
# FeatGraph-DGL
FeatGraph is an efficient backend for Graph Neural Networks based on TVM.
- Original repo: https://github.com/amazon-research/FeatGraph
- SC2020 Paper: https://www.csl.cornell.edu/~zhiruz/pdfs/featgraph-sc2020.pdf
This folder contains the code for integrating FeatGraph kernels into DGL.
## Usage
After building DGL with `USE_TVM=ON`, you should be able to run:
```bash
python test.py
```
to verify correctness.
## Reference
- [TVM Tutorial on Deploy TVM Module using C++ API](https://tvm.apache.org/docs/deploy/cpp_deploy.html).
/*!
* Copyright (c) 2020 by Contributors
* \file featgraph/include/featgraph.h
* \brief FeatGraph kernel headers.
*/
#ifndef FEATGRAPH_H_
#define FEATGRAPH_H_
#include <dlpack/dlpack.h>
#include <string>
namespace dgl {
namespace featgraph {
/* \brief Load FeatGraph module from a given path. */
void LoadFeatGraphModule(const std::string& path);
/* \brief Call FeatGraph's SDDMM kernel. */
void SDDMMTreeReduction(DLManagedTensor* row, DLManagedTensor* col,
DLManagedTensor* lhs, DLManagedTensor* rhs,
DLManagedTensor* out);
} // namespace featgraph
} // namespace dgl
#endif // FEATGRAPH_H_
""" Export featgraph kernels to a shared library. """
import tvm
from sddmm import sddmm_tree_reduction_gpu
def get_sddmm_kernels_gpu(idtypes, dtypes):
"""
Parameters
----------
idtypes: List[str]
Possible index types.
dtypes: List[str]
Possible data types.
Returns
-------
List[IRModule]:
The list of IRModules.
"""
ret = []
# SDDMM Tree Reduction
for dtype in dtypes:
for idtype in idtypes:
ret.append(sddmm_tree_reduction_gpu(idtype, dtype))
return ret
if __name__ == '__main__':
binary_path = 'libfeatgraph_kernels.so'
kernels = []
idtypes = ['int32', 'int64']
dtypes = ['float16', 'float64', 'float32', 'int32', 'int64']
kernels += get_sddmm_kernels_gpu(idtypes, dtypes)
# build kernels and export the module to libfeatgraph_kernels.so
module = tvm.build(kernels, target='cuda', target_host='llvm')
module.export_library(binary_path)
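Once exported, the library can be sanity-checked before DGL ever loads it. A minimal sketch, assuming a CUDA-enabled TVM build and the `SDDMMTreeReduction_<idtype>_<dtype>` naming used when lowering the schedules:

```python
import tvm

# Load the shared library produced above and look up one of the packed kernels
# by the name given to tvm.lower() in sddmm.py.
mod = tvm.runtime.load_module('libfeatgraph_kernels.so')
func = mod.get_function('SDDMMTreeReduction_int32_float32')
print(func)  # a PackedFunc handle if the kernel was exported correctly
```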
""" The compute function and schedules for SDDMM kernels written in TVM. """
import tvm
from tvm import te
def sddmm_tree_reduction_gpu(idx_type, feat_type):
""" SDDMM kernels on GPU optimized with Tree Reduction.
Parameters
----------
idx_type : str
The data type for indexing tensors.
feat_type : str
The data type of feature tensor.
Returns
-------
IRModule
The result IRModule.
"""
# define vars and placeholders
nnz = te.var('nnz', idx_type)
num_rows = te.var('num_rows', idx_type)
num_cols = te.var('num_cols', idx_type)
H = te.var('num_heads', idx_type)
D = te.var('feat_len', idx_type)
row = te.placeholder((nnz,), idx_type, 'row')
col = te.placeholder((nnz,), idx_type, 'col')
ufeat = te.placeholder((num_rows, H, D), feat_type, 'ufeat')
vfeat = te.placeholder((num_cols, H, D), feat_type, 'vfeat')
# define edge computation function
def edge_func(eid, h, i):
k = te.reduce_axis((0, D), name='k')
return te.sum(ufeat[row[eid], h, k] * vfeat[col[eid], h, k], axis=k)
out = te.compute((nnz, H, tvm.tir.IntImm(idx_type, 1)), edge_func, name='out')
# define schedules
sched = te.create_schedule(out.op)
edge_axis, head_axis, _ = out.op.axis
reduce_axis = out.op.reduce_axis[0]
_, red_inner = sched[out].split(reduce_axis, factor=32)
edge_outer, edge_inner = sched[out].split(edge_axis, factor=32)
sched[out].bind(red_inner, te.thread_axis('threadIdx.x'))
sched[out].bind(edge_inner, te.thread_axis('threadIdx.y'))
sched[out].bind(edge_outer, te.thread_axis('blockIdx.x'))
sched[out].bind(head_axis, te.thread_axis('blockIdx.y'))
return tvm.lower(sched, [row, col, ufeat, vfeat, out],
name='SDDMMTreeReduction_{}_{}'.format(idx_type, feat_type))
if __name__ == '__main__':
kernel0 = sddmm_tree_reduction_gpu('int32', 'float32')
print(kernel0)
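For reference, the edge computation above is an ordinary sampled dense-dense matrix multiplication: each nonzero (edge) takes a per-head dot product of its source and destination feature vectors. A plain NumPy sketch of the same computation (illustration only, not part of the kernel):

```python
import numpy as np

def sddmm_reference(row, col, ufeat, vfeat):
    """out[e, h, 0] = sum_k ufeat[row[e], h, k] * vfeat[col[e], h, k]."""
    nnz = row.shape[0]
    num_heads = ufeat.shape[1]
    out = np.zeros((nnz, num_heads, 1), dtype=ufeat.dtype)
    for e in range(nnz):
        for h in range(num_heads):
            # dot product over the feature dimension for edge e, head h
            out[e, h, 0] = np.dot(ufeat[row[e], h], vfeat[col[e], h])
    return out
```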
/*!
* Copyright (c) 2020 by Contributors
* \file featgraph/src/featgraph.cc
* \brief FeatGraph kernels.
*/
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <dmlc/logging.h>
#include <featgraph.h>
namespace dgl {
namespace featgraph {
/* \brief Singleton that loads the featgraph module. */
class FeatGraphModule {
public:
static FeatGraphModule* Global() {
static FeatGraphModule inst;
return &inst;
}
void Load(const std::string& path) {
mod = tvm::runtime::Module::LoadFromFile(path);
}
inline tvm::runtime::ModuleNode* Get() {
auto ret = mod.operator->();
if (!ret) {
LOG(FATAL) << "FeatGraph module have not been loaded. "
<< "Please set path of featgraph shared library.";
}
return ret;
}
private:
tvm::runtime::Module mod;
FeatGraphModule() {}
};
/* \brief Load FeatGraph module from a given path. */
void LoadFeatGraphModule(const std::string& path) {
FeatGraphModule::Global()->Load(path);
}
/* \brief Convert DLDataType to string. */
inline std::string DTypeAsStr(const DLDataType& t) {
switch(t.code) {
case 0U: return "int" + std::to_string(t.bits);
case 1U: return "uint" + std::to_string(t.bits);
case 2U: return "float" + std::to_string(t.bits);
case 3U: return "bfloat" + std::to_string(t.bits);
default: LOG(FATAL) << "Type code " << t.code << " not recognized";
}
}
/* \brief Get operator function name. */
inline std::string GetOperatorName(
const std::string& base_name,
const DLDataType& dtype,
const DLDataType& idtype) {
return base_name + "_" + DTypeAsStr(dtype) + "_" + DTypeAsStr(idtype);
}
/* \brief Call FeatGraph's SDDMM kernel. */
void SDDMMTreeReduction(DLManagedTensor* row, DLManagedTensor* col,
DLManagedTensor* lhs, DLManagedTensor* rhs,
DLManagedTensor* out) {
tvm::runtime::ModuleNode* mod = FeatGraphModule::Global()->Get();
std::string f_name = GetOperatorName("SDDMMTreeReduction",
(row->dl_tensor).dtype,
(lhs->dl_tensor).dtype);
tvm::runtime::PackedFunc f = mod->GetFunction(f_name);
if (f != nullptr)
f(row, col, lhs, rhs, out);
}
} // namespace featgraph
} // namespace dgl
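The name composed by GetOperatorName has to match the name passed to tvm.lower in sddmm.py (index type first, then feature type). A small Python sketch of that naming contract, mirroring DTypeAsStr/GetOperatorName above for illustration:

```python
# Mirrors DTypeAsStr/GetOperatorName from featgraph.cc (illustration only).
TYPE_CODE = {0: 'int', 1: 'uint', 2: 'float', 3: 'bfloat'}

def dtype_as_str(code, bits):
    return TYPE_CODE[code] + str(bits)

def operator_name(base, idx_dtype, feat_dtype):
    # featgraph.cc passes the row index dtype first and the feature dtype second,
    # so the result matches 'SDDMMTreeReduction_{idtype}_{dtype}' used in sddmm.py.
    return base + '_' + dtype_as_str(*idx_dtype) + '_' + dtype_as_str(*feat_dtype)

assert operator_name('SDDMMTreeReduction', (0, 32), (2, 32)) == 'SDDMMTreeReduction_int32_float32'
```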
/*
* NOTE(zihao): this file was modified from TVM project:
* - https://github.com/apache/tvm/blob/9713d675c64ae3075e10be5acadeef1328a44bb5/apps/howto_deploy/tvm_runtime_pack.cc
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \brief This is an all in one TVM runtime file.
*
* You only have to use this file to compile libtvm_runtime to
* include in your project.
*
* - Copy this file into your project which depends on tvm runtime.
* - Compile with -std=c++14
* - Add the following include path
* - /path/to/tvm/include/
* - /path/to/tvm/3rdparty/dmlc-core/include/
* - /path/to/tvm/3rdparty/dlpack/include/
* - Add -lpthread -ldl to the linked library.
* - You are good to go.
* - See the Makefile in the same folder for example.
*
 * The include files here are presented with relative paths.
 * You need to remember to change them to point to the right files.
*
*/
#include <../src/runtime/c_runtime_api.cc>
#include <../src/runtime/cpu_device_api.cc>
#include <../src/runtime/file_utils.cc>
#include <../src/runtime/library_module.cc>
#include <../src/runtime/module.cc>
#include <../src/runtime/ndarray.cc>
#include <../src/runtime/object.cc>
#include <../src/runtime/registry.cc>
#include <../src/runtime/thread_pool.cc>
#include <../src/runtime/threading_backend.cc>
#include <../src/runtime/workspace_pool.cc>
// NOTE: all the files after this are optional modules
// that you can include or remove, depending on which features you use.
// Likely we only need to enable one of the following
// If you use Module::Load, use dso_module
// For system packed library, use system_lib_module
#include <../src/runtime/dso_library.cc>
// #include <../src/runtime/system_library.cc>
// Graph runtime
// #include "../../src/runtime/graph/graph_runtime.cc"
// #include "../../src/runtime/graph/graph_runtime_factory.cc"
// Uncomment the following lines to enable RPC
// #include "../../src/runtime/rpc/rpc_session.cc"
// #include "../../src/runtime/rpc/rpc_event_impl.cc"
// #include "../../src/runtime/rpc/rpc_server_env.cc"
// These macros enable the device API when uncommented.
#define TVM_CUDA_RUNTIME 1
// #define TVM_METAL_RUNTIME 1
// #define TVM_OPENCL_RUNTIME 1
// Uncomment the following lines to enable Metal
// #include "../../src/runtime/metal/metal_device_api.mm"
// #include "../../src/runtime/metal/metal_module.mm"
// Uncomment the following lines to enable CUDA
#include <../src/runtime/cuda/cuda_device_api.cc>
#include <../src/runtime/cuda/cuda_module.cc>
// Uncomment the following lines to enable OpenCL
// #include "../../src/runtime/opencl/opencl_device_api.cc"
// #include "../../src/runtime/opencl/opencl_module.cc"
import torch
import dgl
import dgl.backend as F
g = dgl.rand_graph(10, 15).int().to(torch.device(0))
gidx = g._graph
u = torch.rand((10,2,8), device=torch.device(0))
v = torch.rand((10,2,8), device=torch.device(0))
e = dgl.ops.gsddmm(g, 'dot', u, v)
print(e)
e = torch.zeros((15,2,1), device=torch.device(0))
u = F.zerocopy_to_dgl_ndarray(u)
v = F.zerocopy_to_dgl_ndarray(v)
e = F.zerocopy_to_dgl_ndarray_for_write(e)
dgl.sparse._CAPI_FG_LoadModule("../build/featgraph/libfeatgraph_kernels.so")
dgl.sparse._CAPI_FG_SDDMMTreeReduction(gidx, u, v, e)
print(e)
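The two printed tensors should agree. A variant of the script above that keeps the gsddmm result around and compares the outputs numerically (an extension for illustration, not part of this commit):

```python
import torch
import dgl
import dgl.backend as F

dev = torch.device(0)
g = dgl.rand_graph(10, 15).int().to(dev)
u = torch.rand((10, 2, 8), device=dev)
v = torch.rand((10, 2, 8), device=dev)

ref = dgl.ops.gsddmm(g, 'dot', u, v)        # DGL's built-in SDDMM as the reference
out = torch.zeros((15, 2, 1), device=dev)   # buffer the FeatGraph kernel writes into

dgl.sparse._CAPI_FG_LoadModule("../build/featgraph/libfeatgraph_kernels.so")
dgl.sparse._CAPI_FG_SDDMMTreeReduction(
    g._graph,
    F.zerocopy_to_dgl_ndarray(u),
    F.zerocopy_to_dgl_ndarray(v),
    F.zerocopy_to_dgl_ndarray_for_write(out))

assert torch.allclose(ref, out, rtol=1e-4, atol=1e-4)
print("FeatGraph SDDMMTreeReduction matches dgl.ops.gsddmm")
```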
@@ -6,6 +6,10 @@
#include <dgl/packed_func_ext.h>
#include <dgl/base_heterograph.h>
#ifdef USE_TVM
#include <featgraph.h>
#endif // USE_TVM
#include "kernel_decl.h" #include "kernel_decl.h"
#include "../c_api_common.h" #include "../c_api_common.h"
...@@ -220,7 +224,38 @@ DGL_REGISTER_GLOBAL("sparse._CAPI_DGLKernelBwdSegmentCmp") ...@@ -220,7 +224,38 @@ DGL_REGISTER_GLOBAL("sparse._CAPI_DGLKernelBwdSegmentCmp")
CheckCtx(feat->ctx, {feat, arg, out}, {"feat", "arg", "out"}); CheckCtx(feat->ctx, {feat, arg, out}, {"feat", "arg", "out"});
CheckContiguous({feat, arg, out}, {"feat", "arg", "out"}); CheckContiguous({feat, arg, out}, {"feat", "arg", "out"});
BackwardSegmentCmpDispatch(feat, arg, out); BackwardSegmentCmpDispatch(feat, arg, out);
}); });
#ifdef USE_TVM
DGL_REGISTER_GLOBAL("sparse._CAPI_FG_LoadModule")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
const std::string path = args[0];
dgl::featgraph::LoadFeatGraphModule(path);
});
DGL_REGISTER_GLOBAL("sparse._CAPI_FG_SDDMMTreeReduction")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef graph = args[0];
NDArray lhs = args[1];
NDArray rhs = args[2];
NDArray out = args[3];
CheckCtx(graph->Context(), {lhs, rhs, out}, {"lhs", "rhs", "out"});
CheckContiguous({lhs, rhs, out}, {"lhs", "rhs", "out"});
CHECK_EQ(graph->NumEdgeTypes(), 1);
// auto pair = graph->meta_graph()->FindEdge(0); // only one etype in the graph.
// const dgl_type_t src_vtype = pair.first;
// const dgl_type_t dst_vtype = pair.second;
// CheckShape(
// {graph->NumVertices(src_vtype), graph->NumEdges(0), graph->NumVertices(dst_vtype)},
// {lhs_target, rhs_target, 1},
// {lhs, rhs, out},
// {"U_data", "E_data", "V_data"});
COOMatrix coo = graph.sptr()->GetCOOMatrix(0);
dgl::featgraph::SDDMMTreeReduction(coo.row.ToDLPack(), coo.col.ToDLPack(),
lhs.ToDLPack(), rhs.ToDLPack(), out.ToDLPack());
});
#endif // USE_TVM
} // namespace aten
} // namespace dgl
#if !defined(_WIN32)
#ifdef USE_AVX
#include <../src/array/cpu/spmm.h>
#include <dgl/array.h>
#include <gtest/gtest.h>
#include <time.h>
...
Subproject commit b2e418cb109df4cd1f17a2cf2894a1b396a6b838