Unverified Commit 9a7235fa authored by Quan (Andy) Gan's avatar Quan (Andy) Gan Committed by GitHub
Browse files

[Performance] Use allocator from PyTorch if possible (#2328)

* first commit

* some thoughts

* move around

* more commit

* more fixes

* now it uses torch allocator

* fix symbol export error

* fix

* fixes

* test fix

* add script

* building separate library per version

* fix for vs2019

* more fixes

* fix on windows build

* update jenkinsfile

* auto copy built dlls for windows

* lint and installation guide update

* fix

* specify conda environment

* set environment for ci

* fix

* fix

* fix

* fix again

* revert

* fix cmake

* fix

* switch to using python interpreter path

* remove scripts

* debug

* oops sorry

* Update index.rst

* Update index.rst

* copies automatically, no need for this

* do not print message if library not found

* tiny fixes

* debug on nightly

* replace add_compile_definitions to make CMake 3.5 happy

* fix linking to wrong lib for multiple pytorch envs

* changed building strategy

* fix nightly

* fix windows

* fix windows again

* setup bugfix

* address comments

* change README
parent 4444a43a
...@@ -61,9 +61,13 @@ include_directories("third_party/minigun/minigun") ...@@ -61,9 +61,13 @@ include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src") include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/phmap/") include_directories("third_party/phmap/")
include_directories("third_party/xbyak/") include_directories("third_party/xbyak/")
include_directories("tensoradapter/include")
# initial variables # initial variables
set(DGL_LINKER_LIBS "") if(NOT MSVC)
set(DGL_LINKER_LIBS "dl")
endif(NOT MSVC)
if(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin") if(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
set(DGL_RUNTIME_LINKER_LIBS "") set(DGL_RUNTIME_LINKER_LIBS "")
else(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin") else(MSVC OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
...@@ -110,7 +114,8 @@ if(USE_OPENMP) ...@@ -110,7 +114,8 @@ if(USE_OPENMP)
endif(USE_OPENMP) endif(USE_OPENMP)
if(USE_AVX) if(USE_AVX)
add_compile_definitions(USE_AVX) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSE_AVX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX")
endif(USE_AVX) endif(USE_AVX)
# To compile METIS correct for DGL. # To compile METIS correct for DGL.
...@@ -183,6 +188,46 @@ if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS) ...@@ -183,6 +188,46 @@ if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
endif(LIBCXX_ENABLE_PARALLEL_ALGORITHMS) endif(LIBCXX_ENABLE_PARALLEL_ALGORITHMS)
target_link_libraries(dgl ${DGL_LINKER_LIBS} ${DGL_RUNTIME_LINKER_LIBS}) target_link_libraries(dgl ${DGL_LINKER_LIBS} ${DGL_RUNTIME_LINKER_LIBS})
if(MSVC)
add_custom_command(
TARGET dgl POST_BUILD COMMAND
cmd.exe /c "COPY /Y Release\\dgl.dll .")
endif(MSVC)
# Tensor adapter libraries
# Linking against LibTorch involves linking against a bunch of other libraries
# returned by PyTorch's CMake (e.g. C10 or NVTools). Because CMake caches
# the found libraries in find_library(), often times CMake will look into the libraries
# of the wrong version when I build everything in the same CMake process. As
# a result, I (BarclayII) am launching an individual CMake build for every PyTorch version.
if(BUILD_TORCH)
file(TO_NATIVE_PATH ${CMAKE_CURRENT_BINARY_DIR} BINDIR)
file(TO_NATIVE_PATH ${CMAKE_COMMAND} CMAKE_CMD)
if(MSVC)
file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch/build.bat BUILD_SCRIPT)
add_custom_target(
tensoradapter_pytorch
${CMAKE_COMMAND} -E env
CMAKE_COMMAND=${CMAKE_CMD}
CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
BINDIR=${BINDIR}
cmd /e:on /c ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
DEPENDS ${BUILD_SCRIPT}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tensoradapter/pytorch)
else(MSVC)
file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/tensoradapter/pytorch/build.sh BUILD_SCRIPT)
add_custom_target(
tensoradapter_pytorch
${CMAKE_COMMAND} -E env
CMAKE_COMMAND=${CMAKE_CMD}
CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
BINDIR=${CMAKE_CURRENT_BINARY_DIR}
bash ${BUILD_SCRIPT} ${TORCH_PYTHON_INTERPS}
DEPENDS ${BUILD_SCRIPT}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tensoradapter/pytorch)
endif(MSVC)
add_dependencies(dgl tensoradapter_pytorch)
endif(BUILD_TORCH)
# Installation rules # Installation rules
install(TARGETS dgl DESTINATION lib${LIB_SUFFIX}) install(TARGETS dgl DESTINATION lib${LIB_SUFFIX})
......
#!/usr/bin/env groovy #!/usr/bin/env groovy
dgl_linux_libs = "build/libdgl.so, build/runUnitTests, python/dgl/_ffi/_cy3/core.cpython-36m-x86_64-linux-gnu.so" dgl_linux_libs = "build/libdgl.so, build/runUnitTests, python/dgl/_ffi/_cy3/core.cpython-36m-x86_64-linux-gnu.so, build/tensoradapter/pytorch/*.so"
// Currently DGL on Windows is not working with Cython yet // Currently DGL on Windows is not working with Cython yet
dgl_win64_libs = "build\\dgl.dll, build\\runUnitTests.exe" dgl_win64_libs = "build\\dgl.dll, build\\runUnitTests.exe, build\\tensoradapter\\pytorch\\*.dll"
def init_git() { def init_git() {
sh "rm -rf *" sh "rm -rf *"
......
...@@ -288,9 +288,7 @@ Right now, DGL works on [PyTorch](https://pytorch.org) 1.5.0+, [MXNet](https://m ...@@ -288,9 +288,7 @@ Right now, DGL works on [PyTorch](https://pytorch.org) 1.5.0+, [MXNet](https://m
``` ```
conda install -c dglteam dgl # cpu version conda install -c dglteam dgl # cpu version
conda install -c dglteam dgl-cuda9.0 # CUDA 9.0
conda install -c dglteam dgl-cuda9.2 # CUDA 9.2 conda install -c dglteam dgl-cuda9.2 # CUDA 9.2
conda install -c dglteam dgl-cuda10.0 # CUDA 10.0
conda install -c dglteam dgl-cuda10.1 # CUDA 10.1 conda install -c dglteam dgl-cuda10.1 # CUDA 10.1
conda install -c dglteam dgl-cuda10.2 # CUDA 10.2 conda install -c dglteam dgl-cuda10.2 # CUDA 10.2
conda install -c dglteam dgl-cuda11.0 # CUDA 11.0 conda install -c dglteam dgl-cuda11.0 # CUDA 11.0
...@@ -302,9 +300,7 @@ conda install -c dglteam dgl-cuda11.0 # CUDA 11.0 ...@@ -302,9 +300,7 @@ conda install -c dglteam dgl-cuda11.0 # CUDA 11.0
| | Latest Nightly Build Version | Stable Version | | | Latest Nightly Build Version | Stable Version |
|-----------|-------------------------------|-------------------------| |-----------|-------------------------------|-------------------------|
| CPU | `pip install --pre dgl` | `pip install dgl` | | CPU | `pip install --pre dgl` | `pip install dgl` |
| CUDA 9.0 | `pip install --pre dgl-cu90` | `pip install dgl-cu90` |
| CUDA 9.2 | `pip install --pre dgl-cu92` | `pip install dgl-cu92` | | CUDA 9.2 | `pip install --pre dgl-cu92` | `pip install dgl-cu92` |
| CUDA 10.0 | `pip install --pre dgl-cu100` | `pip install dgl-cu100` |
| CUDA 10.1 | `pip install --pre dgl-cu101` | `pip install dgl-cu101` | | CUDA 10.1 | `pip install --pre dgl-cu101` | `pip install dgl-cu101` |
| CUDA 10.2 | `pip install --pre dgl-cu102` | `pip install dgl-cu102` | | CUDA 10.2 | `pip install --pre dgl-cu102` | `pip install dgl-cu102` |
| CUDA 11.0 | `pip install --pre dgl-cu110` | `pip install dgl-cu110` | | CUDA 11.0 | `pip install --pre dgl-cu110` | `pip install dgl-cu110` |
......
...@@ -42,3 +42,6 @@ set(USE_OPENMP ON) ...@@ -42,3 +42,6 @@ set(USE_OPENMP ON)
# Whether to enable Intel's avx optimized kernel # Whether to enable Intel's avx optimized kernel
set(USE_AVX ON) set(USE_AVX ON)
# Whether to build PyTorch plugins
set(BUILD_TORCH ON)
...@@ -246,6 +246,9 @@ macro(dgl_config_cuda out_variable) ...@@ -246,6 +246,9 @@ macro(dgl_config_cuda out_variable)
# 0. Add host flags # 0. Add host flags
message(STATUS "${CMAKE_CXX_FLAGS}") message(STATUS "${CMAKE_CXX_FLAGS}")
string(REGEX REPLACE "[ \t\n\r]" "," CXX_HOST_FLAGS "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "[ \t\n\r]" "," CXX_HOST_FLAGS "${CMAKE_CXX_FLAGS}")
if(MSVC AND NOT USE_MSVC_MT)
string(CONCAT CXX_HOST_FLAGS ${CXX_HOST_FLAGS} ",/MD")
endif()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler ,${CXX_HOST_FLAGS}") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler ,${CXX_HOST_FLAGS}")
# 1. Add arch flags # 1. Add arch flags
...@@ -260,7 +263,7 @@ macro(dgl_config_cuda out_variable) ...@@ -260,7 +263,7 @@ macro(dgl_config_cuda out_variable)
include(CheckCXXCompilerFlag) include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14)
string(REPLACE "-std=c++11" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") string(REPLACE "-std=c++11" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
list(APPEND CUDA_NVCC_FLAGS "--std=c++14") list(APPEND CUDA_NVCC_FLAGS "-std=c++14")
message(STATUS "CUDA flags: ${CUDA_NVCC_FLAGS}") message(STATUS "CUDA flags: ${CUDA_NVCC_FLAGS}")
......
...@@ -11,7 +11,7 @@ DGL works with the following operating systems: ...@@ -11,7 +11,7 @@ DGL works with the following operating systems:
* macOS X * macOS X
* Windows 10 * Windows 10
DGL requires Python version 3.6 or later. DGL requires Python version 3.6, 3.7, 3.8 or 3.9.
DGL supports multiple tensor libraries as backends, e.g., PyTorch, MXNet. For requirements on backends and how to select one, see :ref:`backends`. DGL supports multiple tensor libraries as backends, e.g., PyTorch, MXNet. For requirements on backends and how to select one, see :ref:`backends`.
...@@ -121,34 +121,32 @@ install the Python binding for DGL. ...@@ -121,34 +121,32 @@ install the Python binding for DGL.
Windows Windows
``````` ```````
The Windows source build is tested with CMake and MinGW/GCC. We highly recommend You can build DGL with MSBuild. With `MS Build Tools <https://go.microsoft.com/fwlink/?linkid=840931>`_
using CMake and GCC from `conda installations <https://conda.io/miniconda.html>`_. To and `CMake on Windows <https://cmake.org/download/>`_ installed, run the following
get started, run the following: in VS2019 x64 Native tools command prompt.
.. code:: bash
conda install cmake m2w64-gcc m2w64-make
Build the shared library and install the Python binding.
.. code:: - CPU only build
.. code::
md build MD build
cd build CD build
cmake -DCMAKE_CXX_FLAGS="-DDMLC_LOG_STACK_TRACE=0 -DDGL_EXPORTS" -DCMAKE_MAKE_PROGRAM=mingw32-make .. -G "MSYS Makefiles" cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" -DDMLC_FORCE_SHARED_CRT=ON .. -G "Visual Studio 16 2019"
mingw32-make msbuild dgl.sln /m
cd ..\python CD ..\python
python setup.py install python setup.py install
- CUDA build
.. code::
You can also build DGL with MSBuild. With `MS Build Tools <https://go.microsoft.com/fwlink/?linkid=840931>`_ MD build
and `CMake on Windows <https://cmake.org/download/>`_ installed, run the following CD build
in VS2017 x64 Native tools command prompt. cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" -DDMLC_FORCE_SHARED_CRT=ON -DUSE_CUDA=ON .. -G "Visual Studio 16 2019"
msbuild dgl.sln /m
CD ..\python
python setup.py install
.. code:: Optional Flags
``````````````
MD build - If you are using PyTorch, you can add ``-DBUILD_TORCH=ON`` flag in CMake
CD build to build PyTorch plugins for further performance optimization. This applies for Linux,
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DCMAKE_CONFIGURATION_TYPES="Release" .. -G "Visual Studio 15 2017 Win64" Windows, and Mac.
msbuild dgl.sln
cd ..\python
python setup.py install
...@@ -540,6 +540,11 @@ DGL_DLL int DGLStreamStreamSynchronize(int device_type, ...@@ -540,6 +540,11 @@ DGL_DLL int DGLStreamStreamSynchronize(int device_type,
DGLStreamHandle src, DGLStreamHandle src,
DGLStreamHandle dst); DGLStreamHandle dst);
/*!
* \brief Sets the path to the tensoradapter library
*/
DGL_DLL void DGLSetTAPath(const char *path_cstr);
/*! /*!
* \brief Bug report macro. * \brief Bug report macro.
* *
......
/*!
* Copyright (c) 2017 by Contributors
* \file dgl/runtime/env.h
* \brief Structure for holding DGL global environment variables
*/
#ifndef DGL_RUNTIME_ENV_H_
#define DGL_RUNTIME_ENV_H_
#include <string>
/*!
 * \brief Global environment variables.
 *
 * Process-wide singleton holding configuration shared across the DGL
 * runtime (currently only the tensor adapter library path, which is
 * written by DGLSetTAPath and read by TensorDispatcher).
 */
struct Env {
  /*!
   * \brief Get the singleton instance.
   * \return Pointer to the process-wide Env (function-local static,
   *         constructed on first call).
   */
  static Env* Global() {
    static Env inst;
    return &inst;
  }
  /*! \brief the path to the tensoradapter library (empty = no adapter) */
  std::string ta_path;
};
#endif // DGL_RUNTIME_ENV_H_
/*!
* Copyright (c) 2020 by Contributors
* \file array/tensordispatch.h
* \brief This file defines the dispatcher of tensor operators to framework-specific
* implementations.
*
* The dispatcher consists of a TensorDispatcher singleton in DGL C library and
* one separately-built shared library per supported backend.
*
* Those shared libraries contain wrappers of the framework-specific operators.
* The wrappers have almost the same signatures as functions in aten namespace,
* except that they accept and return DLManagedTensors instead of NDArrays.
* The wrappers are defined with extern "C", meaning that the C++ compiler will
* not do name mangling for those functions so that DGL can conveniently locate
* them using dlsym(3) (or GetProcAddress in Windows).
*
* The TensorDispatcher singleton maintains a mapping from an array operator to
* the address of the corresponding symbol in the shared library. During
* initialization, the TensorDispatcher checks which backend DGL is using.
* It then locates and opens the corresponding shared library using dlopen(3) (or
* LoadLibrary in Windows), and populates the said mapping above with dlsym(3)
* (or GetProcAddress in Windows).
*
* A tensor operator in TensorDispatcher first checks whether the corresponding symbol
* address is found in the mapping. If so, it calls the function located at the
* symbol address instead, translating NDArrays to DLManagedTensors using
* NDArray::ToDLPack(), and translates the DLManagedTensors in the return values
* back to NDArrays using NDArray::FromDLPack(). If not, it falls back to the
* implementation in dgl::aten namespace.
*/
#ifndef DGL_RUNTIME_TENSORDISPATCH_H_
#define DGL_RUNTIME_TENSORDISPATCH_H_
#include <dlpack/dlpack.h>
#include <tensoradapter.h>
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#endif // WIN32
#include <vector>
#include "ndarray.h"
/*! \brief Casts a pointer \c entry to a function pointer with signature of \c func */
#define FUNCCAST(func, entry) (*reinterpret_cast<decltype(&(func))>(entry))
namespace dgl {
namespace runtime {
/*!
 * \brief Dispatcher that delegates the function calls to framework-specific C++ APIs.
 *
 * Singleton; symbols are resolved once in the constructor from the shared
 * library located at Env::Global()->ta_path.
 */
class TensorDispatcher {
 public:
  /*! \brief Get the singleton instance. */
  static TensorDispatcher* Global() {
    static TensorDispatcher inst;
    return &inst;
  }

  /*! \brief Whether an adapter library was successfully loaded. */
  inline bool IsAvailable() {
    return available_;
  }

  /*!
   * \brief Allocate an empty tensor.
   *
   * Used in NDArray::Empty().  Callers must check IsAvailable() first:
   * when no adapter library is loaded the entrypoint is nullptr and
   * calling through it is undefined behavior.
   *
   * \param shape The shape of the tensor.
   * \param dtype The data type.
   * \param ctx The device context.
   * \return The allocated tensor wrapped as an NDArray.
   */
  inline NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) const {
    auto entry = entrypoints_[Op::kEmpty];
    auto result = FUNCCAST(tensoradapter::TAempty, entry)(shape, dtype, ctx);
    return NDArray::FromDLPack(result);
  }

 private:
  /*! \brief ctor; loads the adapter library and resolves symbols */
  TensorDispatcher();
  /*! \brief dtor; closes the adapter library handle if open */
  ~TensorDispatcher();

  /*!
   * \brief List of symbols in the adapter library.
   *
   * Must match the functions in tensoradapter/include/tensoradapter.h.
   */
  static constexpr const char *names_[] = {
    "TAempty",
  };

  /*! \brief Index of each function to the symbol list */
  class Op {
   public:
    static constexpr int kEmpty = 0;
  };

  /*! \brief Number of functions */
  static constexpr int num_entries_ = sizeof(names_) / sizeof(names_[0]);

  /*! \brief Entrypoints of each function; nullptr until resolved */
  void* entrypoints_[num_entries_] = {nullptr};

  /*! \brief Set to true only after the library has been opened */
  bool available_ = false;

  // BUGFIX: initialize handle_ to nullptr.  The constructor returns early
  // without assigning handle_ when ta_path is empty, but the destructor
  // dereferences the "if (handle_)" check — reading an uninitialized member
  // is undefined behavior and could call FreeLibrary/dlclose on garbage.
#if defined(WIN32) || defined(_WIN32)
  HINSTANCE handle_ = nullptr;
#else   // !WIN32
  void* handle_ = nullptr;
#endif  // WIN32
};
}; // namespace runtime
}; // namespace dgl
#endif // DGL_RUNTIME_TENSORDISPATCH_H_
...@@ -9,8 +9,7 @@ and transforming graphs. ...@@ -9,8 +9,7 @@ and transforming graphs.
# This initializes Winsock and performs cleanup at termination as required # This initializes Winsock and performs cleanup at termination as required
import socket import socket
# Need to ensure that the backend framework is imported before load dgl libs, # Should import backend before importing anything else
# otherwise weird cuda problem happens
from .backend import load_backend, backend_name from .backend import load_backend, backend_name
from . import function from . import function
......
...@@ -31,15 +31,17 @@ class DGLError(Exception): ...@@ -31,15 +31,17 @@ class DGLError(Exception):
def _load_lib(): def _load_lib():
"""Load libary by searching possible path.""" """Load libary by searching possible path."""
lib_path = libinfo.find_lib_path() lib_path = libinfo.find_lib_path()
lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_GLOBAL) lib = ctypes.CDLL(lib_path[0])
dirname = os.path.dirname(lib_path[0])
basename = os.path.basename(lib_path[0])
# DMatrix functions # DMatrix functions
lib.DGLGetLastError.restype = ctypes.c_char_p lib.DGLGetLastError.restype = ctypes.c_char_p
return lib, os.path.basename(lib_path[0]) return lib, basename, dirname
# version number # version number
__version__ = libinfo.__version__ __version__ = libinfo.__version__
# library instance of nnvm # library instance of nnvm
_LIB, _LIB_NAME = _load_lib() _LIB, _LIB_NAME, _DIR_NAME = _load_lib()
# The FFI mode of DGL # The FFI mode of DGL
_FFI_MODE = os.environ.get("DGL_FFI", "auto") _FFI_MODE = os.environ.get("DGL_FFI", "auto")
...@@ -109,3 +111,26 @@ def decorate(func, fwrapped): ...@@ -109,3 +111,26 @@ def decorate(func, fwrapped):
""" """
import decorator import decorator
return decorator.decorate(func, fwrapped) return decorator.decorate(func, fwrapped)
def set_ta_path(backend, version):
    """Tell DGL which tensoradapter library to look for symbols.

    Parameters
    ----------
    backend : str
        The backend (currently ``pytorch``, ``mxnet`` or ``tensorflow``).
    version : str
        The version number of the backend.
    """
    # Drop any local-build suffix (e.g. "1.7.0+cu102" -> "1.7.0").
    version = version.split('+')[0]
    if sys.platform.startswith('win'):
        basename = 'tensoradapter_%s_%s.dll' % (backend, version)
    elif sys.platform.startswith('darwin'):
        basename = 'libtensoradapter_%s_%s.dylib' % (backend, version)
    elif sys.platform.startswith('linux'):
        basename = 'libtensoradapter_%s_%s.so' % (backend, version)
    else:
        raise NotImplementedError('Unsupported system: %s' % sys.platform)
    # The adapter libraries live next to the main DGL shared library.
    full_path = os.path.join(_DIR_NAME, 'tensoradapter', backend, basename)
    _LIB.DGLSetTAPath(full_path.encode('utf-8'))
...@@ -19,6 +19,29 @@ def _gen_missing_api(api, mod_name): ...@@ -19,6 +19,29 @@ def _gen_missing_api(api, mod_name):
return _missing_api return _missing_api
def load_backend(mod_name): def load_backend(mod_name):
# Load backend does four things:
# (1) Import backend framework (PyTorch, MXNet, Tensorflow, etc.)
# (2) Import DGL C library. DGL imports it *after* PyTorch/MXNet/Tensorflow. Otherwise
# DGL will crash with errors like `munmap_chunk(): invalid pointer`.
# (3) Sets up the tensoradapter library path.
# (4) Import the Python wrappers of the backend framework. DGL does this last because
# it already depends on both the backend framework and the DGL C library.
if mod_name == 'pytorch':
import torch
mod = torch
elif mod_name == 'mxnet':
import mxnet
mod = mxnet
elif mod_name == 'tensorflow':
import tensorflow
mod = tensorflow
else:
raise NotImplementedError('Unsupported backend: %s' % mod_name)
from .._ffi.base import set_ta_path # imports DGL C library
version = mod.__version__
set_ta_path(mod_name, version)
print('Using backend: %s' % mod_name, file=sys.stderr) print('Using backend: %s' % mod_name, file=sys.stderr)
mod = importlib.import_module('.%s' % mod_name, __name__) mod = importlib.import_module('.%s' % mod_name, __name__)
thismod = sys.modules[__name__] thismod = sys.modules[__name__]
......
...@@ -35,11 +35,45 @@ def get_lib_path(): ...@@ -35,11 +35,45 @@ def get_lib_path():
return libs, version return libs, version
def get_ta_lib_pattern():
    """Return the glob pattern matching tensoradapter shared libraries
    on the current platform.

    Raises
    ------
    NotImplementedError
        If the platform is not Linux, macOS or Windows.
    """
    if sys.platform.startswith('win'):
        return 'tensoradapter_*.dll'
    if sys.platform.startswith('darwin'):
        return 'libtensoradapter_*.dylib'
    if sys.platform.startswith('linux'):
        return 'libtensoradapter_*.so'
    raise NotImplementedError('Unsupported system: %s' % sys.platform)
LIBS, VERSION = get_lib_path() LIBS, VERSION = get_lib_path()
BACKENDS = ['pytorch']
TA_LIB_PATTERN = get_ta_lib_pattern()
def cleanup():
    """Remove build artifacts previously copied into the source tree for
    wheel/conda builds (MANIFEST.in, copied DGL libraries, and the
    per-backend tensoradapter libraries)."""
    def _remove_quietly(path):
        # Best-effort removal: a missing file is fine, but only swallow
        # OS-level errors.  The original bare ``except:`` also caught
        # KeyboardInterrupt/SystemExit, which could mask a user abort.
        try:
            os.remove(path)
        except OSError:
            pass

    # Wheel cleanup
    _remove_quietly("MANIFEST.in")
    for path in LIBS:
        _, libname = os.path.split(path)
        _remove_quietly(os.path.join("dgl", libname))
    for backend in BACKENDS:
        for ta_path in glob.glob(
                os.path.join(CURRENT_DIR, "dgl", "tensoradapter", backend, TA_LIB_PATTERN)):
            _remove_quietly(ta_path)
def config_cython(): def config_cython():
"""Try to configure cython and return cython configuration""" """Try to configure cython and return cython configuration"""
if os.name == 'nt': if sys.platform.startswith('win'):
print("WARNING: Cython is not supported on Windows, will compile without cython module") print("WARNING: Cython is not supported on Windows, will compile without cython module")
return [] return []
sys_cflags = sysconfig.get_config_var("CFLAGS") sys_cflags = sysconfig.get_config_var("CFLAGS")
...@@ -84,6 +118,8 @@ include_libs = False ...@@ -84,6 +118,8 @@ include_libs = False
wheel_include_libs = False wheel_include_libs = False
if "bdist_wheel" in sys.argv or os.getenv('CONDA_BUILD'): if "bdist_wheel" in sys.argv or os.getenv('CONDA_BUILD'):
wheel_include_libs = True wheel_include_libs = True
elif "clean" in sys.argv:
cleanup()
else: else:
include_libs = True include_libs = True
...@@ -94,8 +130,18 @@ if wheel_include_libs: ...@@ -94,8 +130,18 @@ if wheel_include_libs:
with open("MANIFEST.in", "w") as fo: with open("MANIFEST.in", "w") as fo:
for path in LIBS: for path in LIBS:
shutil.copy(path, os.path.join(CURRENT_DIR, 'dgl')) shutil.copy(path, os.path.join(CURRENT_DIR, 'dgl'))
_, libname = os.path.split(path) dir_, libname = os.path.split(path)
fo.write("include dgl/%s\n" % libname) fo.write("include dgl/%s\n" % libname)
for backend in BACKENDS:
for ta_path in glob.glob(os.path.join(dir_, "tensoradapter", backend, TA_LIB_PATTERN)):
ta_name = os.path.basename(ta_path)
os.makedirs(os.path.join(CURRENT_DIR, 'dgl', 'tensoradapter', backend), exist_ok=True)
shutil.copy(
os.path.join(dir_, 'tensoradapter', backend, ta_name),
os.path.join(CURRENT_DIR, 'dgl', 'tensoradapter', backend))
fo.write("include dgl/tensoradapter/%s/%s\n" % (backend, ta_name))
setup_kwargs = { setup_kwargs = {
"include_package_data": True "include_package_data": True
} }
...@@ -104,9 +150,17 @@ if wheel_include_libs: ...@@ -104,9 +150,17 @@ if wheel_include_libs:
# Conda build also includes the binary library # Conda build also includes the binary library
if include_libs: if include_libs:
rpath = [os.path.relpath(path, CURRENT_DIR) for path in LIBS] rpath = [os.path.relpath(path, CURRENT_DIR) for path in LIBS]
data_files = [('dgl', rpath)]
for path in LIBS:
for backend in BACKENDS:
data_files.append((
'dgl/tensoradapter/%s' % backend,
glob.glob(os.path.join(
os.path.dirname(os.path.relpath(path, CURRENT_DIR)),
'tensoradapter', backend, TA_LIB_PATTERN))))
setup_kwargs = { setup_kwargs = {
"include_package_data": True, "include_package_data": True,
"data_files": [('dgl', rpath)] "data_files": data_files
} }
setup( setup(
...@@ -136,8 +190,4 @@ setup( ...@@ -136,8 +190,4 @@ setup(
) )
if wheel_include_libs: if wheel_include_libs:
# Wheel cleanup cleanup()
os.remove("MANIFEST.in")
for path in LIBS:
_, libname = os.path.split(path)
os.remove("dgl/%s" % libname)
...@@ -144,6 +144,11 @@ struct PairIterator : public std::iterator<std::random_access_iterator_tag, ...@@ -144,6 +144,11 @@ struct PairIterator : public std::iterator<std::random_access_iterator_tag,
return PairRef<V1, V2>(row, col); return PairRef<V1, V2>(row, col);
} }
// required for random access iterators in VS2019
PairRef<V1, V2> operator[](size_t offset) const {
return PairRef<V1, V2>(row + offset, col + offset);
}
V1 *row; V1 *row;
V2 *col; V2 *col;
}; };
......
...@@ -151,6 +151,11 @@ struct CooIterator : public std::iterator<std::random_access_iterator_tag, ...@@ -151,6 +151,11 @@ struct CooIterator : public std::iterator<std::random_access_iterator_tag,
return TupleRef<IdType>(row, col, data); return TupleRef<IdType>(row, col, data);
} }
// required for random access iterators in VS2019
TupleRef<IdType> operator[](size_t offset) const {
return TupleRef<IdType>(row + offset, col + offset, data + offset);
}
IdType *row, *col, *data; IdType *row, *col, *data;
}; };
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <dgl/runtime/module.h> #include <dgl/runtime/module.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/env.h>
#include <array> #include <array>
#include <algorithm> #include <algorithm>
#include <string> #include <string>
...@@ -378,6 +379,10 @@ int DGLCbArgToReturn(DGLValue* value, int code) { ...@@ -378,6 +379,10 @@ int DGLCbArgToReturn(DGLValue* value, int code) {
API_END(); API_END();
} }
void DGLSetTAPath(const char *path_cstr) {
Env::Global()->ta_path = std::string(path_cstr);
}
// set device api // set device api
DGL_REGISTER_GLOBAL(dgl::runtime::symbol::dgl_set_device) DGL_REGISTER_GLOBAL(dgl::runtime::symbol::dgl_set_device)
.set_body([](DGLArgs args, DGLRetValue *ret) { .set_body([](DGLArgs args, DGLRetValue *ret) {
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/shared_mem.h> #include <dgl/runtime/shared_mem.h>
#include <dgl/zerocopy_serializer.h> #include <dgl/zerocopy_serializer.h>
#include <dgl/runtime/tensordispatch.h>
#include "runtime_base.h" #include "runtime_base.h"
// deleter for arrays used by DLPack exporter // deleter for arrays used by DLPack exporter
...@@ -200,6 +201,10 @@ NDArray NDArray::EmptyShared(const std::string &name, ...@@ -200,6 +201,10 @@ NDArray NDArray::EmptyShared(const std::string &name,
NDArray NDArray::Empty(std::vector<int64_t> shape, NDArray NDArray::Empty(std::vector<int64_t> shape,
DLDataType dtype, DLDataType dtype,
DLContext ctx) { DLContext ctx) {
TensorDispatcher* td = TensorDispatcher::Global();
if (td->IsAvailable())
return td->Empty(shape, dtype, ctx);
NDArray ret = Internal::Create(shape, dtype, ctx); NDArray ret = Internal::Create(shape, dtype, ctx);
// setup memory content // setup memory content
size_t size = GetDataSize(ret.data_->dl_tensor); size_t size = GetDataSize(ret.data_->dl_tensor);
......
/*!
* Copyright (c) 2019 by Contributors
* \file runtime/tensordispatch.cc
* \brief Adapter library caller
*/
#include <dgl/runtime/tensordispatch.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/env.h>
#include <dgl/packed_func_ext.h>
#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#else // !WIN32
#include <dlfcn.h>
#endif // WIN32
#include <cstring>
namespace dgl {
namespace runtime {
constexpr const char *TensorDispatcher::names_[];
/*!
 * \brief Load the tensor adapter library and resolve its symbols.
 *
 * Reads the library path stored via DGLSetTAPath() from
 * Env::Global()->ta_path.  If the path is empty or the library cannot
 * be opened, available_ stays false and every tensor operator falls
 * back to DGL's own implementation.
 *
 * NOTE(review): available_ is set to true even if individual symbols in
 * names_ fail to resolve (their entrypoints stay nullptr) — confirm that
 * every listed symbol is guaranteed to exist in the adapter library.
 */
TensorDispatcher::TensorDispatcher() {
  const std::string& path = Env::Global()->ta_path;
  if (path == "")
    // does not have dispatcher library; all operators fall back to DGL's implementation
    return;
#if defined(WIN32) || defined(_WIN32)
  // Windows equivalents of dlopen/dlsym.
  handle_ = LoadLibrary(path.c_str());
  if (!handle_)
    return;
  for (int i = 0; i < num_entries_; ++i)
    entrypoints_[i] = reinterpret_cast<void*>(GetProcAddress(handle_, names_[i]));
#else  // !WIN32
  // RTLD_LAZY: defer resolution of the library's own undefined symbols
  // until they are first used.
  handle_ = dlopen(path.c_str(), RTLD_LAZY);
  if (!handle_)
    return;
  for (int i = 0; i < num_entries_; ++i)
    entrypoints_[i] = dlsym(handle_, names_[i]);
#endif  // WIN32
  available_ = true;
}
/*!
 * \brief Release the dynamically loaded adapter library, if one was opened.
 */
TensorDispatcher::~TensorDispatcher() {
  if (handle_) {
#if defined(WIN32) || defined(_WIN32)
    FreeLibrary(handle_);
#else  // !WIN32
    dlclose(handle_);
#endif  // WIN32
  }
}
}; // namespace runtime
}; // namespace dgl
/*!
* Copyright (c) 2020 by Contributors
* \file tensoradapter.h
* \brief Header file for functions exposed by the adapter library.
*
* Functions in this library must be exported with extern "C" so that DGL can locate
* them with dlsym(3) (or GetProcAddress on Windows).
*/
#ifndef TENSORADAPTER_H_
#define TENSORADAPTER_H_
#include <dlpack/dlpack.h>
#include <vector>
#if defined(WIN32) || defined(_WIN32)
#define TA_EXPORTS __declspec(dllexport)
#else
#define TA_EXPORTS
#endif
namespace tensoradapter {
extern "C" {
/*!
* \brief Allocate an empty tensor
*
* \param shape The shape
* \param dtype The data type
* \param ctx The device
* \return The allocated tensor
*/
TA_EXPORTS DLManagedTensor* TAempty(
std::vector<int64_t> shape, DLDataType dtype, DLContext ctx);
}
}; // namespace tensoradapter
#endif // TENSORADAPTER_H_
cmake_minimum_required(VERSION 3.5)
project(tensoradapter_pytorch C CXX)

# Find PyTorch cmake files and the PyTorch version with the python
# interpreter $PYTHON_INTERP (or "python" if empty).
file(TO_NATIVE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/find_cmake.py" FIND_CMAKE_PY)
if(NOT PYTHON_INTERP)
  set(PYTHON_INTERP python)
endif()
message(STATUS "Using Python interpreter: ${PYTHON_INTERP}")
execute_process(
  COMMAND ${PYTHON_INTERP} ${FIND_CMAKE_PY}
  RESULT_VARIABLE FIND_CMAKE_STATUS
  OUTPUT_VARIABLE TORCH_PREFIX_VER
  OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "find_cmake.py output: ${TORCH_PREFIX_VER}")
# Fail early with a clear message instead of a confusing list(GET) error below.
if(NOT FIND_CMAKE_STATUS EQUAL 0)
  message(FATAL_ERROR
    "Running find_cmake.py with ${PYTHON_INTERP} failed (status ${FIND_CMAKE_STATUS})")
endif()
list(LENGTH TORCH_PREFIX_VER TORCH_PREFIX_VER_LEN)
if(TORCH_PREFIX_VER_LEN LESS 2)
  message(FATAL_ERROR
    "find_cmake.py must print the Torch CMake prefix and version; got: ${TORCH_PREFIX_VER}")
endif()
list(GET TORCH_PREFIX_VER 0 TORCH_PREFIX)
list(GET TORCH_PREFIX_VER 1 TORCH_VER)
message(STATUS "Configuring for PyTorch ${TORCH_VER}")

set(Torch_DIR "${TORCH_PREFIX}/Torch")
message(STATUS "Setting directory to ${Torch_DIR}")
find_package(Torch REQUIRED)

# PyTorch's documented convention is to append TORCH_CXX_FLAGS to the global
# flags; they carry definitions (e.g. the libstdc++ dual-ABI macro) that must
# match the LibTorch binaries exactly.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TORCH_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
if(NOT MSVC)
  # -O0/-g3/-ggdb are GCC/Clang-only options; do not feed them to cl.exe.
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb")
endif()

# One adapter library per PyTorch version so multiple environments can coexist.
set(TORCH_TARGET_NAME "tensoradapter_pytorch_${TORCH_VER}")
file(GLOB TA_TORCH_SRC *.cpp)  # NOTE: adding a source file requires re-running CMake
add_library(${TORCH_TARGET_NAME} SHARED "${TA_TORCH_SRC}")
message(STATUS "tensoradapter found PyTorch includes: ${TORCH_INCLUDE_DIRS}")
message(STATUS "tensoradapter found PyTorch lib: ${TORCH_LIBRARIES}")
target_include_directories(
  ${TORCH_TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include")
target_include_directories(
  ${TORCH_TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/dlpack/include")
target_include_directories(
  ${TORCH_TARGET_NAME} PRIVATE "${TORCH_INCLUDE_DIRS}")
target_link_libraries(${TORCH_TARGET_NAME} PRIVATE "${TORCH_LIBRARIES}")
set_property(TARGET ${TORCH_TARGET_NAME} PROPERTY CXX_STANDARD 14)
message(STATUS "Configured target ${TORCH_TARGET_NAME}")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment