Commit a715222c authored by yuguo

0.9.1-rocm

parent f262efc9
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_CASTER_SIZE_H_
#define ONEFLOW_API_PYTHON_CASTER_SIZE_H_
#include <type_traits>
#include <Python.h>
#include <pybind11/pybind11.h>
#include "oneflow/api/python/framework/size.h"
#include "oneflow/core/common/shape.h"
PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
class shape : public object {
public:
PYBIND11_OBJECT_CVT(shape, object, oneflow::TensorSize_Check, raw_shape)
explicit shape(size_t size = 0) : object(oneflow::TensorSize_New((ssize_t)size), stolen_t{}) {
if (!m_ptr) pybind11_fail("Could not allocate tensor size object!");
}
size_t size() const { return (size_t)PyTuple_Size(m_ptr); }
bool empty() const { return size() == 0; }
detail::tuple_accessor operator[](size_t index) const { return {*this, index}; }
detail::item_accessor operator[](handle h) const { return object::operator[](h); }
detail::tuple_iterator begin() const { return {*this, 0}; }
detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; }
private:
static PyObject* raw_shape(PyObject* op) {
if (oneflow::TensorSize_Check(op)) return handle(op).inc_ref().ptr();
return PyObject_CallFunctionObjArgs((PyObject*)&oneflow::TensorSize_Type, op, NULL);
}
};
PYBIND11_NAMESPACE_BEGIN(detail)
template<typename T>
struct shape_type_caster {
public:
bool load(handle src, bool convert) {
value_ = nullptr;
if (src && src.is_none()) { return true; }
if (!oneflow::TensorSize_Check(src.ptr())) { return false; }
value_ = std::make_shared<T>(oneflow::TensorSize_AsShape(src.ptr()));
return true;
}
template<typename U>
static handle cast(U&& src, return_value_policy /*policy*/, handle /*parent*/) {
return cast_impl(std::forward<U>(src));
}
template<typename U>
static handle cast(U* src, return_value_policy policy, handle parent) {
if (!src) { return none().release(); }
return cast(*src, policy, parent);
}
operator T*() { return value_.get(); }
operator T&() { return *value_; }
operator T&&() && { return std::move(*value_); }
operator std::shared_ptr<T>*() { return &value_; }
operator std::shared_ptr<T>&() { return value_; }
operator std::shared_ptr<T>&&() && { return std::move(value_); }
static constexpr auto name = _("shape");
template<typename U>
using cast_op_type = pybind11::detail::cast_op_type<std::shared_ptr<T>>;
private:
static handle cast_impl(const oneflow::Shape& src) {
return reinterpret_steal<shape>(oneflow::TensorSize_NewFromShape(src)).release();
}
static handle cast_impl(const std::shared_ptr<const oneflow::Shape>& src) {
return reinterpret_steal<shape>(oneflow::TensorSize_NewFromShape(*src)).release();
}
protected:
std::shared_ptr<T> value_;
};
template<>
struct type_caster<oneflow::Shape> : public shape_type_caster<oneflow::Shape> {};
template<>
struct type_caster<std::shared_ptr<oneflow::Shape>> : public shape_type_caster<oneflow::Shape> {};
template<>
struct type_caster<std::shared_ptr<const oneflow::Shape>>
: public shape_type_caster<const oneflow::Shape> {};
PYBIND11_NAMESPACE_END(detail)
PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
#endif // ONEFLOW_API_PYTHON_CASTER_SIZE_H_
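With these caster specializations registered, any pybind11 binding that accepts or returns an oneflow::Shape (or a shared_ptr to one) converts transparently to and from the Python-side oneflow.Size object. A minimal sketch of such a binding, assuming a pybind11 module m and the header above; the function names are illustrative and not part of this commit:
// Hypothetical bindings: the caster turns an incoming oneflow.Size into an
// oneflow::Shape, and turns a returned Shape back into an oneflow.Size.
m.def("shape_elem_cnt", [](const oneflow::Shape& shape) { return shape.elem_cnt(); });
m.def("shape_identity", [](const oneflow::Shape& shape) { return shape; });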
......@@ -13,6 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_CASTER_TENSOR_H_
#define ONEFLOW_API_PYTHON_CASTER_TENSOR_H_
#include <pybind11/pybind11.h>
#include "oneflow/api/python/caster/common.h"
......@@ -100,3 +103,5 @@ struct type_caster<std::shared_ptr<const oneflow::one::Parameter>>
} // namespace detail
} // namespace pybind11
#endif // ONEFLOW_API_PYTHON_CASTER_TENSOR_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/python/dlpack/dlpack.h"
#include "oneflow/api/python/exception/exception.h"
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/data_type.h"
#include "oneflow/core/eager/eager_blob_object.h"
#include "oneflow/core/framework/tensor.h"
#include "oneflow/core/framework/device.h"
#include "oneflow/core/framework/tensor_util.h"
namespace oneflow {
Maybe<Symbol<Device>> ToOneFlowDevice(const DLDevice& ctx) {
switch (ctx.device_type) {
case DLDeviceType::kDLCPU: return JUST(Device::New("cpu"));
#if defined(WITH_CUDA) || defined(WITH_ROCM)
case DLDeviceType::kDLCUDA: return JUST(Device::New("cuda", ctx.device_id));
#endif
default: UNIMPLEMENTED_THEN_RETURN() << "Unsupported device type: " << ctx.device_type;
}
}
Maybe<DataType> ToOneFlowDataType(const DLDataType& dtype) {
DataType ofdtype = DataType::kInvalidDataType;
CHECK_EQ_OR_RETURN(dtype.lanes, 1) << "OneFlow does not support lanes != 1";
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
case 8: ofdtype = DataType::kUInt8; break;
default:
UNIMPLEMENTED_THEN_RETURN() << "Unsupported data type: " << dtype.code << dtype.bits;
}
break;
case DLDataTypeCode::kDLInt:
switch (dtype.bits) {
case 8: ofdtype = DataType::kInt8; break;
case 16: ofdtype = DataType::kInt16; break;
case 32: ofdtype = DataType::kInt32; break;
case 64: ofdtype = DataType::kInt64; break;
default:
UNIMPLEMENTED_THEN_RETURN() << "Unsupported data type: " << dtype.code << dtype.bits;
}
break;
case DLDataTypeCode::kDLFloat:
switch (dtype.bits) {
case 16: ofdtype = DataType::kFloat16; break;
case 32: ofdtype = DataType::kFloat; break;
case 64: ofdtype = DataType::kDouble; break;
default:
UNIMPLEMENTED_THEN_RETURN() << "Unsupported data type: " << dtype.code << dtype.bits;
}
break;
case DLDataTypeCode::kDLBfloat:
switch (dtype.bits) {
case 16: ofdtype = DataType::kBFloat16; break;
default: UNIMPLEMENTED_THEN_RETURN() << "Unsupported data type: bfloat" << dtype.bits;
}
break;
case DLDataTypeCode::kDLComplex:
UNIMPLEMENTED_THEN_RETURN() << "Unsupported data type: complex" << dtype.bits;
break;
default: UNIMPLEMENTED_THEN_RETURN() << "Unsupported code " << dtype.code;
}
CHECK_NE_OR_RETURN(ofdtype, DataType::kInvalidDataType);
return ofdtype;
}
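// Example mapping (illustrative): DLDataType{kDLFloat, 32, 1} resolves to
// DataType::kFloat above, and DLDataType{kDLInt, 64, 1} to DataType::kInt64.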
Maybe<one::Tensor> fromDLPack(const DLManagedTensor* src) {
using namespace one;
const auto& dl_tensor = src->dl_tensor;
Symbol<Device> device = JUST(ToOneFlowDevice(dl_tensor.device));
DataType dtype = JUST(ToOneFlowDataType(dl_tensor.dtype));
// Build TensorMeta
const Shape shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim);
Symbol<LocalTensorMeta> tensor_meta;
if (dl_tensor.strides) {
const auto stride = Stride(dl_tensor.strides, dl_tensor.strides + dl_tensor.ndim);
tensor_meta = SymbolOf(LocalTensorMeta(shape, stride, dtype, device));
} else {
tensor_meta = SymbolOf(LocalTensorMeta(shape, dtype, device));
}
// Build TensorBuffer
const auto& Free = [src](char* dptr) {
if (src->deleter) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
src->deleter(const_cast<DLManagedTensor*>(src));
}
};
size_t array_size_in_bytes = shape.elem_cnt() * GetSizeOfDataType(dtype);
auto tensor_data = std::make_shared<vm::OutsideVmTensorStorage>();
tensor_data->set_blob_dptr(
std::unique_ptr<char, std::function<void(char*)>>(static_cast<char*>(dl_tensor.data), Free),
array_size_in_bytes);
// Build TensorStorage: the Free callback above runs the producer's deleter when the storage is released
auto tensor_storage = std::make_shared<TensorStorage>(tensor_data);
// Build Tensor
auto tensor_impl = std::make_shared<EagerLocalTensorImpl>(tensor_storage,
/*requires_grad=*/false,
/*is_leaf=*/true);
// Init blob
JUST(tensor_impl->InitEagerBlobObject(tensor_meta, NewLocalDepObject()));
const auto& stream = JUST(GetDefaultStreamByDevice(device));
const auto& eager_blob_object = JUST(tensor_impl->eager_blob_object());
JUST(eager_blob_object->init_producer_stream(stream));
eager_blob_object->set_last_used_stream(stream);
return std::static_pointer_cast<Tensor>(std::make_shared<LocalTensor>(tensor_impl));
}
Maybe<DLDevice> ToDLDevice(Symbol<Device> ofdevice) {
DLDevice ctx;
ctx.device_id = ofdevice->device_id();
switch (ofdevice->enum_type()) {
case DeviceType::kCPU: ctx.device_type = DLDeviceType::kDLCPU; break;
#if defined(WITH_CUDA) || defined(WITH_ROCM)
case DeviceType::kCUDA: ctx.device_type = DLDeviceType::kDLCUDA; break;
#endif
default: UNIMPLEMENTED_THEN_RETURN() << "Unsupported device type: " << ofdevice->type();
}
return ctx;
}
Maybe<DLDataType> ToDLDataType(DataType ofdtype) {
DLDataType dtype;
dtype.lanes = 1;
dtype.bits = GetSizeOfDataType(ofdtype) * 8;
switch (ofdtype) {
case DataType::kUInt8: dtype.code = DLDataTypeCode::kDLUInt; break;
case DataType::kInt8: dtype.code = DLDataTypeCode::kDLInt; break;
case DataType::kInt16: dtype.code = DLDataTypeCode::kDLInt; break;
case DataType::kInt32: dtype.code = DLDataTypeCode::kDLInt; break;
case DataType::kInt64: dtype.code = DLDataTypeCode::kDLInt; break;
case DataType::kFloat16: dtype.code = DLDataTypeCode::kDLFloat; break;
case DataType::kFloat: dtype.code = DLDataTypeCode::kDLFloat; break;
case DataType::kDouble: dtype.code = DLDataTypeCode::kDLFloat; break;
case DataType::kBFloat16: dtype.code = DLDataTypeCode::kDLBfloat; break;
default: UNIMPLEMENTED_THEN_RETURN() << "Unsupported data type: " << DataType_Name(ofdtype);
}
return dtype;
}
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct ATenDLMTensor {
std::shared_ptr<one::Tensor> handle;
DLManagedTensor tensor;
};
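// Ownership note: handle keeps the exported OneFlow tensor alive for as long as the
// consumer holds the DLManagedTensor; manager_ctx points back at this struct so the
// deleter below can free it once the consumer is done.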
void deleter(DLManagedTensor* arg) { delete static_cast<ATenDLMTensor*>(arg->manager_ctx); }
Maybe<DLManagedTensor*> toDLPack(const std::shared_ptr<one::Tensor>& src) {
auto shape = *src->shape();
auto strides = *JUST(src->stride());
// create a new tensor with possibly normalized strides
// Reference:
// https://github.com/pytorch/pytorch/issues/83069
// https://github.com/pytorch/pytorch/issues/82610
for (int i = 0; i < src->ndim(); i++) {
if (shape[i] <= 1) { strides[i] = 1; }
}
ATenDLMTensor* atDLMTensor(new ATenDLMTensor);
atDLMTensor->handle = src;
atDLMTensor->tensor.manager_ctx = atDLMTensor;
atDLMTensor->tensor.deleter = &deleter;
JUST(one::SyncAccessTensorWithTimeOut(
src,
[&](ep::Stream*, const std::shared_ptr<vm::EagerBlobObject>& tensor) {
atDLMTensor->tensor.dl_tensor.data = tensor->mut_raw_dptr();
},
"const"));
auto dldevice = JUST(ToDLDevice(JUST(src->device())));
auto dldtype = JUST(ToDLDataType(src->dtype()->data_type()));
atDLMTensor->tensor.dl_tensor.device = *dldevice;
atDLMTensor->tensor.dl_tensor.ndim = src->ndim();
atDLMTensor->tensor.dl_tensor.dtype = *dldtype;
atDLMTensor->tensor.dl_tensor.shape =
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<int64_t*>(src->shape()->data());
atDLMTensor->tensor.dl_tensor.strides =
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<int64_t*>(JUST(src->stride())->data());
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
return &(atDLMTensor->tensor);
}
// This function is mostly copied from PyTorch
void DLPack_Capsule_Destructor(PyObject* data) {
if (likely(!PyCapsule_IsValid(data, "dltensor"))) {
// early out, see DLPack spec: if a consuming library sets the capsule
// name to something else, they own it and we don't need to do anything
return;
}
HANDLE_ERRORS
// Causes overheads for validity checks again, but this case is rare
// since consuming libraries should rename the capsule according to spec.
// Note that this cannot set a python error (we checked validity above),
// so we don't need to handle python error state here.
DLManagedTensor* dlMTensor = (DLManagedTensor*)PyCapsule_GetPointer(data, "dltensor");
// the dlMTensor has not been consumed, call deleter ourselves.
// DLPack spec mentions that deleter may be NULL, but deleter from
// `flow.to_dlpack` is never NULL, so no need for an additional check here.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
dlMTensor->deleter(const_cast<DLManagedTensor*>(dlMTensor));
END_HANDLE_ERRORS_RET()
}
namespace py = pybind11;
ONEFLOW_API_PYBIND11_MODULE("", m) {
m.def("to_dlpack", [](const std::shared_ptr<one::Tensor>& tensor) -> Maybe<py::capsule> {
DLManagedTensor* dlMTensor = JUST(toDLPack(tensor));
return py::capsule(dlMTensor, "dltensor", DLPack_Capsule_Destructor);
});
// from_dlpack is exported in tensor_api.yaml
}
} // namespace oneflow
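Taken together, toDLPack and fromDLPack give a zero-copy round trip for an eager local tensor. A rough sketch of how the two entry points compose, assuming the code sits inside namespace oneflow with the includes above; RoundTripExample is illustrative and not part of this commit:
// Export a tensor and re-import the resulting DLManagedTensor; both tensors
// alias the same storage, and the Free callback installed by fromDLPack runs
// the producer's deleter once the re-imported tensor's storage is released.
Maybe<void> RoundTripExample(const std::shared_ptr<one::Tensor>& t) {
  DLManagedTensor* managed = JUST(toDLPack(t));
  std::shared_ptr<one::Tensor> reimported = JUST(fromDLPack(managed));
  return Maybe<void>::Ok();
}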
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/python/dlpack/dlpack.h"
#include "oneflow/core/common/maybe.h"
namespace oneflow {
namespace one {
class Tensor;
}
Maybe<one::Tensor> fromDLPack(const DLManagedTensor* src);
Maybe<DLManagedTensor*> toDLPack(const std::shared_ptr<one::Tensor>& src);
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*!
* Copyright (c) 2017 by Contributors
* \file dlpack.h
* \brief The common header of DLPack.
*/
#ifndef DLPACK_DLPACK_H_
#define DLPACK_DLPACK_H_
/**
* \brief Compatibility with C++
*/
#ifdef __cplusplus
#define DLPACK_EXTERN_C extern "C"
#else
#define DLPACK_EXTERN_C
#endif
/*! \brief The current version of dlpack */
#define DLPACK_VERSION 70
/*! \brief The current ABI version of dlpack */
#define DLPACK_ABI_VERSION 1
/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
#ifdef DLPACK_EXPORTS
#define DLPACK_DLL __declspec(dllexport)
#else
#define DLPACK_DLL __declspec(dllimport)
#endif
#else
#define DLPACK_DLL
#endif
#include <stdint.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/*!
* \brief The device type in DLDevice.
*/
#ifdef __cplusplus
typedef enum : int32_t {
#else
typedef enum {
#endif
/*! \brief CPU device */
kDLCPU = 1,
/*! \brief CUDA GPU device */
kDLCUDA = 2,
/*!
* \brief Pinned CUDA CPU memory by cudaMallocHost
*/
kDLCUDAHost = 3,
/*! \brief OpenCL devices. */
kDLOpenCL = 4,
/*! \brief Vulkan buffer for next generation graphics. */
kDLVulkan = 7,
/*! \brief Metal for Apple GPU. */
kDLMetal = 8,
/*! \brief Verilog simulator buffer */
kDLVPI = 9,
/*! \brief ROCm GPUs for AMD GPUs */
kDLROCM = 10,
/*!
* \brief Pinned ROCm CPU memory allocated by hipMallocHost
*/
kDLROCMHost = 11,
/*!
* \brief Reserved extension device type,
* used to quickly test extension devices.
* The semantics can differ depending on the implementation.
*/
kDLExtDev = 12,
/*!
* \brief CUDA managed/unified memory allocated by cudaMallocManaged
*/
kDLCUDAManaged = 13,
/*!
* \brief Unified shared memory allocated on a oneAPI non-partitioned
* device. Call to oneAPI runtime is required to determine the device
* type, the USM allocation type and the sycl context it is bound to.
*
*/
kDLOneAPI = 14,
/*! \brief GPU support for next generation WebGPU standard. */
kDLWebGPU = 15,
/*! \brief Qualcomm Hexagon DSP */
kDLHexagon = 16,
} DLDeviceType;
/*!
* \brief A Device for Tensor and operator.
*/
typedef struct {
/*! \brief The device type used in the device. */
DLDeviceType device_type;
/*!
* \brief The device index.
* For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
*/
int32_t device_id;
} DLDevice;
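/*
 * Example (illustrative): the second CUDA GPU is described as
 *   DLDevice dev = {kDLCUDA, 1};
 * and plain host memory as {kDLCPU, 0}.
 */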
/*!
* \brief The type code options DLDataType.
*/
typedef enum {
/*! \brief signed integer */
kDLInt = 0U,
/*! \brief unsigned integer */
kDLUInt = 1U,
/*! \brief IEEE floating point */
kDLFloat = 2U,
/*!
* \brief Opaque handle type, reserved for testing purposes.
* Frameworks need to agree on the handle data type for the exchange to be well-defined.
*/
kDLOpaqueHandle = 3U,
/*! \brief bfloat16 */
kDLBfloat = 4U,
/*!
* \brief complex number
* (C/C++/Python layout: compact struct per complex number)
*/
kDLComplex = 5U,
} DLDataTypeCode;
/*!
* \brief The data type the tensor can hold. The data type is assumed to follow the
* native endian-ness. An explicit error message should be raised when attempting to
* export an array with non-native endianness
*
* Examples
* - float: type_code = 2, bits = 32, lanes=1
* - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
* - int8: type_code = 0, bits = 8, lanes=1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
*/
typedef struct {
/*!
* \brief Type code of base types.
* We keep it uint8_t instead of DLDataTypeCode for minimal memory
* footprint, but the value should be one of DLDataTypeCode enum values.
*/
uint8_t code;
/*!
* \brief Number of bits, common choices are 8, 16, 32.
*/
uint8_t bits;
/*! \brief Number of lanes in the type, used for vector types. */
uint16_t lanes;
} DLDataType;
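/*
 * Example (illustrative): the scalar float32 from the comment above is
 *   DLDataType f32 = {kDLFloat, 32, 1};
 * and the vectorized float4 is {kDLFloat, 32, 4}.
 */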
/*!
* \brief Plain C Tensor object, does not manage memory.
*/
typedef struct {
/*!
* \brief The data pointer points to the allocated data. This will be CUDA
* device pointer or cl_mem handle in OpenCL. It may be opaque on some device
* types. This pointer is always aligned to 256 bytes as in CUDA. The
* `byte_offset` field should be used to point to the beginning of the data.
*
* Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
* TVM, perhaps others) do not adhere to this 256 byte alignment requirement
* on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
* (after which this note will be updated); at the moment it is recommended
* to not rely on the data pointer being correctly aligned.
*
* For given DLTensor, the size of memory required to store the contents of
* data is calculated as follows:
*
* \code{.c}
* static inline size_t GetDataSize(const DLTensor* t) {
* size_t size = 1;
* for (tvm_index_t i = 0; i < t->ndim; ++i) {
* size *= t->shape[i];
* }
* size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
* return size;
* }
* \endcode
*/
void* data;
/*! \brief The device of the tensor */
DLDevice device;
/*! \brief Number of dimensions */
int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating the tensor is compact and row-major.
*/
int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
/*!
* \brief C Tensor object, manage memory of DLTensor. This data structure is
* intended to facilitate the borrowing of DLTensor by another framework. It is
* not meant to transfer the tensor. When the borrowing framework doesn't need
* the tensor, it should call the deleter to notify the host that the resource
* is no longer needed.
*/
typedef struct DLManagedTensor {
/*! \brief DLTensor which is being memory managed */
DLTensor dl_tensor;
/*! \brief the context of the original host framework in which this
* DLManagedTensor is used. It can also be NULL.
*/
void* manager_ctx;
/*! \brief Destructor signature void (*)(void*) - this should be called
* to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
* if there is no way for the caller to provide a reasonable destructor.
* The destructor deletes the argument self as well.
*/
void (*deleter)(struct DLManagedTensor* self);
} DLManagedTensor;
#ifdef __cplusplus
} // DLPACK_EXTERN_C
#endif
#endif // DLPACK_DLPACK_H_
......@@ -22,7 +22,9 @@ ONEFLOW_API_PYBIND11_MODULE("eager", m) {
using namespace oneflow;
namespace py = pybind11;
m.def(
"Sync", []() { return vm::ClusterSync(); }, py::call_guard<py::gil_scoped_release>());
"Sync", []() { return vm::CurrentRankSync(); }, py::call_guard<py::gil_scoped_release>());
m.def(
"ClusterSync", []() { return vm::ClusterSync(); }, py::call_guard<py::gil_scoped_release>());
py::class_<one::DevVmDepObjectConsumeModeGuard,
std::shared_ptr<one::DevVmDepObjectConsumeModeGuard>>(
......
......@@ -18,30 +18,85 @@ limitations under the License.
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/job/env_global_objects_scope.h"
#include "oneflow/core/common/singleton.h"
#include "oneflow/core/job/graph_scope_vars.h"
#include "oneflow/core/vm/vm_util.h"
#include "oneflow/core/vm/virtual_machine.h"
#include "oneflow/core/framework/shut_down_util.h"
#include "oneflow/core/device/cuda_util.h"
#ifdef WITH_CUDA
#include <cuda.h>
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#endif // WITH_ROCM
namespace py = pybind11;
namespace oneflow {
#ifdef WITH_CUDA
void RegisterCudaDeviceProperties(py::module& m) {
py::class_<cudaDeviceProp>(m, "_CudaDeviceProperties", py::module_local())
.def(py::init<>())
.def_readonly("name", &cudaDeviceProp::name)
.def_readonly("major", &cudaDeviceProp::major)
.def_readonly("minor", &cudaDeviceProp::minor)
.def_readonly("is_multi_gpu_board", &cudaDeviceProp::isMultiGpuBoard)
.def_readonly("is_integrated", &cudaDeviceProp::integrated)
.def_readonly("multi_processor_count", &cudaDeviceProp::multiProcessorCount)
.def_readonly("total_memory", &cudaDeviceProp::totalGlobalMem)
.def("__repr__", [](const cudaDeviceProp& prop) {
std::ostringstream stream;
stream << "_CudaDeviceProperties(name='" << prop.name << "', major=" << prop.major
<< ", minor=" << prop.minor
<< ", total_memory=" << prop.totalGlobalMem / (1024 * 1024)
<< "MB, multi_processor_count=" << prop.multiProcessorCount << ")";
return stream.str();
});
}
#endif // WITH_CUDA
#ifdef WITH_ROCM
void RegisterCudaDeviceProperties(py::module& m) {
py::class_<hipDeviceProp_t>(m, "_CudaDeviceProperties", py::module_local())
.def(py::init<>())
.def_readonly("name", &hipDeviceProp_t::name)
.def_readonly("major", &hipDeviceProp_t::major)
.def_readonly("minor", &hipDeviceProp_t::minor)
.def_readonly("is_multi_gpu_board", &hipDeviceProp_t::isMultiGpuBoard)
.def_readonly("is_integrated", &hipDeviceProp_t::integrated)
.def_readonly("multi_processor_count", &hipDeviceProp_t::multiProcessorCount)
.def_readonly("total_memory", &hipDeviceProp_t::totalGlobalMem)
.def("__repr__", [](const hipDeviceProp_t& prop) {
std::ostringstream stream;
stream << "_CudaDeviceProperties(name='" << prop.name << "', major=" << prop.major
<< ", minor=" << prop.minor
<< ", total_memory=" << prop.totalGlobalMem / (1024 * 1024)
<< "MB, multi_processor_count=" << prop.multiProcessorCount << ")";
return stream.str();
});
}
#endif // WITH_ROCM
Maybe<void> SwitchToShuttingDownPhase(EnvGlobalObjectsScope* env, bool is_normal_exit) {
JUST(env->init_is_normal_exit(is_normal_exit));
SetShuttingDown(true);
if (is_normal_exit) {
JUST(vm::ClusterSync());
auto* vm = JUST(SingletonMaybe<VirtualMachine>());
JUST(vm->CloseVMThreads());
}
return Maybe<void>::Ok();
}
ONEFLOW_API_PYBIND11_MODULE("", m) {
m.def("CurrentResource", &CurrentResource);
m.def("EnvResource", &EnvResource);
m.def("EnableEagerEnvironment", &EnableEagerEnvironment);
py::class_<oneflow::EnvGlobalObjectsScope, std::shared_ptr<oneflow::EnvGlobalObjectsScope>>(
m, "EnvContext")
......@@ -57,19 +112,30 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
m.def("GetLocalRank", &GetLocalRank);
m.def("InitRDMA", &InitRDMA);
m.def("RDMAIsInitialized", &RDMAIsInitialized);
m.def("DestoryRDMA", &DestoryRDMA);
m.def("CudaGetDeviceCount", &CudaGetDeviceCount);
m.def("EmptyCache", &EmptyCache);
#ifdef WITH_CUDA
RegisterCudaDeviceProperties(m);
m.def("GetCudaDeviceIndex", &GetCudaDeviceIndex);
m.def("SetCudaDeviceIndex", &SetCudaDeviceIndex);
m.def("CudaSynchronize", &CudaSynchronize);
m.def("GetCUDAMemoryUsed", &GetCUDAMemoryUsed);
m.def(
"_get_device_properties",
[](int device) -> cudaDeviceProp* { return GetDeviceProperties(device); },
py::return_value_policy::reference);
#endif // WITH_CUDA
#ifdef WITH_ROCM
RegisterCudaDeviceProperties(m);
m.def("GetCudaDeviceIndex", &GetCudaDeviceIndex);
m.def("SetCudaDeviceIndex", &SetCudaDeviceIndex);
m.def("CudaSynchronize", &CudaSynchronize);
m.def("GetCUDAMemoryUsed", &GetCUDAMemoryUsed);
m.def(
"_get_device_properties",
[](int device) -> hipDeviceProp_t* { return GetDeviceProperties(device); },
py::return_value_policy::reference);
#endif // WITH_ROCM
m.def("SetFLAGS_alsologtostderr", &SetFLAGS_alsologtostderr);
m.def("GetFLAGS_alsologtostderr", &GetFLAGS_alsologtostderr);
......@@ -81,6 +147,10 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
m.def("GetGraphDebugMaxPyStackDepth", &GetGraphDebugMaxPyStackDepth);
m.def("SetGraphDebugMode", &SetGraphDebugMode);
m.def("GetGraphDebugMode", &GetGraphDebugMode);
m.def("SetGraphDebugOnlyUserPyStack", &SetGraphDebugOnlyUserPyStack);
m.def("GetGraphDebugOnlyUserPyStack", &GetGraphDebugOnlyUserPyStack);
m.def("InitPythonPathsToBeKeptAndFilteredForDebugging",
&InitPythonPathsToBeKeptAndFilteredForDebugging);
}
} // namespace oneflow
......@@ -20,7 +20,6 @@ limitations under the License.
#include <google/protobuf/text_format.h>
#include "oneflow/core/common/protobuf.h"
#include "oneflow/core/common/singleton.h"
#include "oneflow/core/job/cluster.h"
#include "oneflow/core/job/cluster_instruction.h"
#include "oneflow/core/job/env_global_objects_scope.h"
#include "oneflow/core/job/global_for.h"
......@@ -44,12 +43,6 @@ inline Maybe<std::string> EnvResource() {
return PbMessage2TxtString(Singleton<ResourceDesc, ForEnv>::Get()->resource());
}
inline Maybe<void> EnableEagerEnvironment(bool enable_eager_execution) {
CHECK_NOTNULL_OR_RETURN((Singleton<bool, EagerExecution>::Get()));
*Singleton<bool, EagerExecution>::Get() = enable_eager_execution;
return Maybe<void>::Ok();
}
inline Maybe<long long> CurrentMachineId() { return GlobalProcessCtx::Rank(); }
inline Maybe<int64_t> GetRank() { return GlobalProcessCtx::Rank(); }
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/throw.h"
#include "oneflow/core/framework/autocast.h"
namespace py = pybind11;
namespace oneflow {
class AutoCastMode {
public:
OF_DISALLOW_COPY_AND_MOVE(AutoCastMode);
AutoCastMode(const std::string& device_type, Symbol<DType> dtype, bool enabled,
bool cache_enabled)
: prev_enabled_(autocast::is_enabled()),
prev_cache_enabled_(autocast::is_autocast_cache_enabled()),
prev_device_type_(autocast::get_autocast_device_type()),
prev_dtype_(autocast::get_autocast_dtype()),
prev_gpu_dtype_(autocast::get_autocast_gpu_dtype()),
prev_cpu_dtype_(autocast::get_autocast_cpu_dtype()) {
// update autocast state
autocast::set_enabled(enabled);
autocast::set_autocast_cache_enabled(cache_enabled);
if (device_type == "cpu") {
autocast::set_autocast_device_type(kCPU);
autocast::set_autocast_dtype(dtype);
autocast::set_autocast_cpu_dtype(dtype);
} else if (device_type == "cuda") {
autocast::set_autocast_device_type(kCUDA);
autocast::set_autocast_dtype(dtype);
autocast::set_autocast_gpu_dtype(dtype);
} else {
THROW(RuntimeError) << "User specified autocast device_type must be 'cuda' or 'cpu'";
}
}
~AutoCastMode() {
autocast::set_enabled(prev_enabled_);
autocast::set_autocast_cache_enabled(prev_cache_enabled_);
autocast::set_autocast_device_type(prev_device_type_);
autocast::set_autocast_dtype(prev_dtype_);
autocast::set_autocast_gpu_dtype(prev_gpu_dtype_);
autocast::set_autocast_cpu_dtype(prev_cpu_dtype_);
}
private:
bool prev_enabled_;
bool prev_cache_enabled_;
DeviceType prev_device_type_;
Symbol<DType> prev_dtype_;
Symbol<DType> prev_gpu_dtype_;
Symbol<DType> prev_cpu_dtype_;
};
ONEFLOW_API_PYBIND11_MODULE("", m) {
py::class_<AutoCastMode, std::shared_ptr<AutoCastMode>>(m, "AutoCastMode")
.def(py::init([](const std::string& device_type, Symbol<DType> dtype, bool enabled,
bool cache_enabled) {
return std::make_shared<AutoCastMode>(device_type, dtype, enabled, cache_enabled);
}));
m.def("is_autocast_enabled", autocast::is_enabled);
m.def("set_autocast_enabled", autocast::set_enabled);
m.def("get_autocast_gpu_dtype", autocast::get_autocast_gpu_dtype);
m.def("get_autocast_cpu_dtype", autocast::get_autocast_cpu_dtype);
m.def("set_autocast_gpu_dtype", autocast::set_autocast_gpu_dtype);
m.def("set_autocast_cpu_dtype", autocast::set_autocast_cpu_dtype);
m.def("is_autocast_cache_enabled", autocast::is_autocast_cache_enabled);
m.def("set_autocast_cache_enabled", autocast::set_autocast_cache_enabled);
m.def("clear_autocast_cache", autocast::clear_cache);
}
} // namespace oneflow
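AutoCastMode is a pure RAII guard: the constructor records the current autocast state and installs the requested one, and the destructor restores whatever was active before, so guards nest naturally. A minimal C++ sketch of that behaviour (illustrative, not part of this commit; assumes the headers above):
{
  // Inside this scope autocast is enabled for CUDA with float16 as the target dtype.
  AutoCastMode guard("cuda", CHECK_JUST(DType::Get(DataType::kFloat16)),
                     /*enabled=*/true, /*cache_enabled=*/true);
  // ... run mixed-precision work here ...
}  // The destructor restores the previous enabled flags, device type and dtypes.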
......@@ -20,6 +20,7 @@ limitations under the License.
#include "oneflow/core/framework/device.h"
#include "oneflow/core/common/str_util.h"
#include "oneflow/core/control/global_process_ctx.h"
#include "oneflow/core/ep/include/device.h"
namespace py = pybind11;
......@@ -39,6 +40,10 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
.def("__repr__", [](const Symbol<Device>& d) { return d->ToRepr(); })
.def(py::self == py::self)
.def(py::hash(py::self));
m.def(
"max_alignment_size", []() { return ep::kMaxAlignmentRequirement; },
py::return_value_policy::copy);
}
} // namespace oneflow
......@@ -16,7 +16,10 @@ limitations under the License.
#include <pybind11/pybind11.h>
#include <pybind11/operators.h>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/api/python/framework/tensortype.h"
#include "oneflow/api/python/functional/common.h"
#include "oneflow/core/framework/dtype.h"
namespace py = pybind11;
namespace oneflow {
......@@ -66,6 +69,19 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
m.attr("complex32") = &CHECK_JUST(DType::Get(DataType::kComplex32));
m.attr("complex64") = &CHECK_JUST(DType::Get(DataType::kComplex64));
m.attr("complex128") = &CHECK_JUST(DType::Get(DataType::kComplex128));
py::options options;
options.disable_function_signatures();
m.def("get_default_dtype", []() { return GetDefaultDType(); });
m.def("set_default_dtype",
[](const Symbol<DType>& dtype) { SetDefaultDType(dtype).GetOrThrow(); });
m.def("set_default_tensor_type", [](const py::object& tensor_type) {
if (one::PyTensorType_Check(tensor_type.ptr())) {
CHECK_JUST(SetDefaultDType(one::PyTensorType_UnpackDType(tensor_type.ptr())));
} else {
throw py::type_error("invalid type object");
}
});
}
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include <string>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/job/foreign_callback.h"
namespace py = pybind11;
namespace oneflow {
class PyForeignCallback : public ForeignCallback {
public:
// Inherit the constructors
using ForeignCallback::ForeignCallback;
// Trampoline (need one for each virtual function)
void OfBlobCall(int64_t unique_id, int64_t ofblob_ptr) const override {
PYBIND11_OVERRIDE(void, /* Return type */
ForeignCallback, /* Parent class */
OfBlobCall, /* Name of function in C++ (must match Python name) */
unique_id, ofblob_ptr /* Argument(s) */
);
}
void RemoveForeignCallback(int64_t unique_id) const override {
PYBIND11_OVERRIDE(void, ForeignCallback, RemoveForeignCallback, unique_id);
}
};
} // namespace oneflow
ONEFLOW_API_PYBIND11_MODULE("", m) {
using namespace oneflow;
py::class_<ForeignCallback, PyForeignCallback, std::shared_ptr<ForeignCallback>>(
m, "ForeignCallback")
.def(py::init<>())
.def("OfBlobCall", &ForeignCallback::OfBlobCall)
.def("RemoveForeignCallback", &ForeignCallback::RemoveForeignCallback);
}
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include <string>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/job/foreign_watcher.h"
namespace py = pybind11;
namespace oneflow {
class PyForeignWatcher : public ForeignWatcher {
public:
using ForeignWatcher::ForeignWatcher;
void Call(const std::string& handler_uuid, int64_t ofblob_ptr) const override {
PYBIND11_OVERRIDE(void, ForeignWatcher, Call, handler_uuid, ofblob_ptr);
}
};
} // namespace oneflow
ONEFLOW_API_PYBIND11_MODULE("", m) {
using namespace oneflow;
py::class_<ForeignWatcher, PyForeignWatcher, std::shared_ptr<ForeignWatcher>>(m, "ForeignWatcher")
.def(py::init<>())
.def("Call", &ForeignWatcher::Call);
}
......@@ -25,30 +25,11 @@ namespace py = pybind11;
namespace oneflow {
ONEFLOW_API_PYBIND11_MODULE("", m) {
m.def("RegisterGlobalForeignCallback", &RegisterGlobalForeignCallback);
m.def("DestroyGlobalForeignCallback", &DestroyGlobalForeignCallback);
m.def("RegisterGlobalWatcher", &RegisterGlobalWatcher);
m.def("LaunchJob", &LaunchJob, py::call_guard<py::gil_scoped_release>());
m.def("GetSerializedInterUserJobInfo",
[]() -> Maybe<py::bytes> { return py::bytes(*JUST(GetSerializedInterUserJobInfo())); });
m.def("GetSerializedJobSet",
[]() -> Maybe<py::bytes> { return py::bytes(*JUST(GetSerializedJobSet())); });
m.def("GetSerializedStructureGraph", &GetSerializedStructureGraph /* a prototxt saved to file*/);
m.def("GetSerializedCurrentJob",
[]() -> Maybe<py::bytes> { return py::bytes(*JUST(GetSerializedCurrentJob())); });
m.def("GetFunctionConfigDef", &GetFunctionConfigDef);
m.def("GetScopeConfigDef", &GetScopeConfigDef);
m.def("GetMachine2DeviceIdListOFRecordFromParallelConf",
&GetSerializedMachineId2DeviceIdListOFRecord);
m.def("LoadSavedModel",
[](const std::string& saved_model_meta_file, bool is_prototxt_file) -> Maybe<py::bytes> {
return py::bytes(*JUST(LoadSavedModel(saved_model_meta_file, is_prototxt_file)));
});
m.def("EagerExecutionEnabled", EagerExecutionEnabled);
m.def("LoadLibrary", &LoadLibrary);
}
......
......@@ -25,82 +25,14 @@ limitations under the License.
#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h"
#include "oneflow/core/job/job_desc.h"
#include "oneflow/core/job/inter_user_job_info.pb.h"
#include "oneflow/core/job/foreign_callback.h"
#include "oneflow/core/job/foreign_watcher.h"
#include "oneflow/core/job/job_instance.h"
#include "oneflow/core/job/oneflow.h"
#include "oneflow/core/job/placement.pb.h"
#include "oneflow/core/framework/config_def.h"
#include "oneflow/core/framework/load_library.h"
#include "oneflow/core/serving/saved_model.pb.h"
namespace oneflow {
inline Maybe<void> RegisterGlobalForeignCallback(const std::shared_ptr<ForeignCallback>& callback) {
CHECK_ISNULL_OR_RETURN(Singleton<std::shared_ptr<ForeignCallback>>::Get())
<< "foreign callback registered";
// Singleton<T>::SetAllocated is preferred since Singleton<T>::New will output logs but
// glog is not constructed yet.
Singleton<std::shared_ptr<ForeignCallback>>::SetAllocated(
new std::shared_ptr<ForeignCallback>(callback));
return Maybe<void>::Ok();
}
inline Maybe<void> DestroyGlobalForeignCallback() {
if (Singleton<std::shared_ptr<ForeignCallback>>::Get()) {
Singleton<std::shared_ptr<ForeignCallback>>::Delete();
}
return Maybe<void>::Ok();
}
inline Maybe<void> RegisterGlobalWatcher(const std::shared_ptr<ForeignWatcher>& watcher) {
CHECK_ISNULL_OR_RETURN(Singleton<std::shared_ptr<ForeignWatcher>>::Get())
<< "foreign watcher registered";
// Singleton<T>::SetAllocated is preferred since Singleton<T>::New will output logs but
// glog is not constructed yet.
Singleton<std::shared_ptr<ForeignWatcher>>::SetAllocated(
new std::shared_ptr<ForeignWatcher>(watcher));
return Maybe<void>::Ok();
}
inline Maybe<void> LaunchJob(const std::shared_ptr<oneflow::JobInstance>& cb) {
CHECK_OR_RETURN(GlobalProcessCtx::IsThisProcessMaster());
CHECK_NOTNULL_OR_RETURN(Singleton<Oneflow>::Get());
const auto& job_name = cb->job_name();
auto* buffer_mgr = Singleton<BufferMgr<std::shared_ptr<JobInstance>>>::Get();
int64_t job_id = Singleton<JobName2JobId>::Get()->at(job_name);
if (IsPullJob(job_name, *Singleton<InterUserJobInfo>::Get())) {
buffer_mgr->Get(GetForeignOutputBufferName(job_name))->Push(cb);
}
if (IsPushJob(job_name, *Singleton<InterUserJobInfo>::Get())) {
buffer_mgr->Get(GetForeignInputBufferName(job_name))->Push(cb);
}
buffer_mgr->Get(GetCallbackNotifierBufferName(job_name))->Push(cb);
Singleton<BufferMgr<int64_t>>::Get()->Get(kBufferNameGlobalWaitJobId)->Push(job_id);
return Maybe<void>::Ok();
}
inline Maybe<std::string> GetSerializedStructureGraph() {
const auto* job_ctx_mgr = Singleton<LazyJobBuildAndInferCtxMgr>::Get();
CHECK_NOTNULL_OR_RETURN(job_ctx_mgr);
return job_ctx_mgr->structure_graph();
}
inline Maybe<std::string> GetSerializedInterUserJobInfo() {
CHECK_OR_RETURN(GlobalProcessCtx::IsThisProcessMaster());
CHECK_NOTNULL_OR_RETURN(Singleton<Oneflow>::Get());
CHECK_NOTNULL_OR_RETURN(Singleton<InterUserJobInfo>::Get());
return Singleton<InterUserJobInfo>::Get()->SerializeAsString();
}
inline Maybe<const JobSet&> GetJobSet() {
auto* job_ctx_mgr = JUST(GlobalJobBuildAndInferCtxMgr());
CHECK_NOTNULL_OR_RETURN(job_ctx_mgr);
return job_ctx_mgr->job_set();
}
inline Maybe<std::string> GetSerializedJobSet() { return JUST(GetJobSet()).SerializeAsString(); }
inline Maybe<std::string> GetSerializedCurrentJob() {
auto* job_ctx_mgr = Singleton<LazyJobBuildAndInferCtxMgr>::Get();
CHECK_NOTNULL_OR_RETURN(job_ctx_mgr);
......@@ -130,17 +62,6 @@ inline Maybe<std::string> GetSerializedMachineId2DeviceIdListOFRecord(
return PbMessage2TxtString(*JUST(ParseMachineAndDeviceIdList(parallel_conf)));
}
inline Maybe<std::string> LoadSavedModel(const std::string& saved_model_meta_file,
bool is_prototxt_file) {
SavedModel saved_model_proto;
if (is_prototxt_file) {
CHECK_OR_RETURN(TryParseProtoFromTextFile(saved_model_meta_file, &saved_model_proto));
} else {
CHECK_OR_RETURN(TryParseProtoFromPbFile(saved_model_meta_file, &saved_model_proto));
}
return saved_model_proto.SerializeAsString();
}
inline Maybe<void> LoadLibraryNow(const std::string& lib_path) { return LoadLibrary(lib_path); }
} // namespace oneflow
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/symbol.h"
#include "oneflow/core/common/throw.h"
#include "oneflow/core/framework/nd_sbp.h"
#include "oneflow/core/job/global_mode.h"
namespace py = pybind11;
namespace oneflow {
ONEFLOW_API_PYBIND11_MODULE("global_view", m) {
py::class_<GlobalMode::Guard, std::shared_ptr<GlobalMode::Guard>>(m, "global_mode")
.def(py::init([](const bool enabled) {
if (enabled) {
THROW(RuntimeError) << "To enable global mode, placement and sbp must be provided.";
}
return std::make_shared<GlobalMode::Guard>(enabled);
}))
.def(py::init([](const bool enabled, const Symbol<ParallelDesc>& placement,
const std::vector<Symbol<SbpParallel>>& sbp) {
if (!enabled) {
THROW(RuntimeError)
<< "To disable global mode, placement and sbp must not be provided.";
}
return std::make_shared<GlobalMode::Guard>(enabled, CHECK_JUST(GetNdSbp(sbp)),
placement);
}),
py::arg("enabled").none(false), py::arg("placement").none(false),
py::arg("sbp").none(false))
.def(py::init([](const bool enabled, const Symbol<ParallelDesc>& placement,
const Symbol<SbpParallel>& sbp) {
return std::make_shared<GlobalMode::Guard>(enabled, CHECK_JUST(SbpToNdSbp(sbp)),
placement);
}),
py::arg("enabled").none(false), py::arg("placement").none(false),
py::arg("sbp").none(false))
.def("__enter__", [](const GlobalMode::Guard& guard_obj) {})
.def("__exit__", [](const GlobalMode::Guard& guard_obj, const py::object& type,
const py::object& value, const py::object& traceback) {});
py::class_<GlobalMode, std::shared_ptr<GlobalMode>>(m, "current_global_mode")
.def(py::init([]() { return std::make_shared<GlobalMode>(); }))
.def_property_readonly("is_enabled", [](const GlobalMode& gm) { return gm.is_enabled(); })
.def_property_readonly("sbp",
[](const GlobalMode& gm) {
if (!gm.is_enabled()) {
THROW(RuntimeError)
<< "Current global mode is disabled, there is no sbp.";
}
const auto& nd_sbp = gm.nd_sbp();
auto tuple = py::tuple(nd_sbp->sbp_parallel_size());
for (int i = 0; i < nd_sbp->sbp_parallel_size(); ++i) {
tuple[i] = SymbolOf(nd_sbp->sbp_parallel(i));
}
return tuple;
})
.def_property_readonly("placement", [](const GlobalMode& gm) {
if (!gm.is_enabled()) {
THROW(RuntimeError) << "Current global mode is disabled, there is no placement.";
}
return gm.parallel_desc();
});
}
} // namespace oneflow
......@@ -44,29 +44,29 @@ ONEFLOW_API_PYBIND11_MODULE("deprecated", m) {
[](const std::shared_ptr<InstructionsBuilder>& builder, int64_t session_id,
const std::string& job_conf_str, const std::string& device_tag,
const std::vector<std::string>& machine_device_ids,
const std::shared_ptr<Shape>& hierarchy, bool is_local) -> Maybe<Scope> {
JobConfigProto job_conf;
CHECK_OR_RETURN(TxtString2PbMessage(job_conf_str, &job_conf))
<< Error::RuntimeError() << "job conf parse failed";
return builder->BuildInitialScope(session_id, job_conf, device_tag, machine_device_ids,
hierarchy, is_local);
},
py::arg("session_id").none(false), py::arg("job_conf_str").none(false),
py::arg("device_tag").none(false), py::arg("machine_device_ids").none(false),
py::arg("hierarchy").none(true), py::arg("is_mirrored").none(false))
py::arg("hierarchy").none(true), py::arg("is_local").none(false))
.def(
"BuildInitialScopeWithPlacement",
[](const std::shared_ptr<InstructionsBuilder>& builder, int64_t session_id,
const std::string& job_conf_str, Symbol<ParallelDesc> placement,
bool is_local) -> Maybe<Scope> {
JobConfigProto job_conf;
CHECK_OR_RETURN(TxtString2PbMessage(job_conf_str, &job_conf))
<< Error::RuntimeError() << "job conf parse failed";
return builder->BuildInitialScopeWithPlacement(session_id, job_conf, placement,
is_local);
},
py::arg("session_id").none(false), py::arg("job_conf_str").none(false),
py::arg("placement").none(false), py::arg("is_mirrored").none(false))
py::arg("placement").none(false), py::arg("is_local").none(false))
.def("BuildScopeWithNewParallelDesc", &InstructionsBuilder::BuildScopeWithNewParallelDesc,
py::arg("scope").none(false), py::arg("device_tag").none(false),
py::arg("machine_device_ids").none(false), py::arg("hierarchy").none(true))
......@@ -79,7 +79,7 @@ ONEFLOW_API_PYBIND11_MODULE("deprecated", m) {
<< Error::RuntimeError() << "parallel conf parse failed";
return builder->BuildScopeWithNewParallelConf(scope, parallel_conf);
})
.def("BuildScopeWithNewIsMirrored", &InstructionsBuilder::BuildScopeWithNewIsMirrored)
.def("BuildScopeWithNewIsLocal", &InstructionsBuilder::BuildScopeWithNewIsLocal)
.def("BuildScopeWithNewScopeName", &InstructionsBuilder::BuildScopeWithNewScopeName)
.def("BuildScopeByProtoStrSetter", &InstructionsBuilder::BuildScopeByProtoStrSetter);
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include <string>
#include <memory>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/job/job_instance.h"
namespace py = pybind11;
namespace oneflow {
class PyJobInstance : public JobInstance {
public:
// Inherit the constructors
using JobInstance::JobInstance;
// Trampoline (need one for each virtual function)
std::string job_name() const override {
PYBIND11_OVERRIDE(std::string, /* Return type */
JobInstance, /* Parent class */
job_name, /* Name of function in C++ (must match Python name) */
);
}
std::string sole_input_op_name_in_user_job() const override {
PYBIND11_OVERRIDE(std::string, JobInstance, sole_input_op_name_in_user_job, );
}
std::string sole_output_op_name_in_user_job() const override {
PYBIND11_OVERRIDE(std::string, JobInstance, sole_output_op_name_in_user_job, );
}
void PushBlob(uint64_t ofblob_ptr) const override {
PYBIND11_OVERRIDE(void, JobInstance, PushBlob, ofblob_ptr);
}
void PullBlob(uint64_t ofblob_ptr) const override {
PYBIND11_OVERRIDE(void, JobInstance, PullBlob, ofblob_ptr);
}
void Finish() const override { PYBIND11_OVERRIDE(void, JobInstance, Finish, ); }
};
} // namespace oneflow
ONEFLOW_API_PYBIND11_MODULE("", m) {
using namespace oneflow;
py::class_<JobInstance, PyJobInstance, std::shared_ptr<JobInstance>>(m, "JobInstance")
.def(py::init<>())
.def("job_name", &JobInstance::job_name)
.def("sole_input_op_name_in_user_job", &JobInstance::sole_input_op_name_in_user_job)
.def("sole_output_op_name_in_user_job", &JobInstance::sole_output_op_name_in_user_job)
.def("PushBlob", &JobInstance::PushBlob)
.def("PullBlob", &JobInstance::PullBlob)
.def("Finish", &JobInstance::Finish);
}
......@@ -86,19 +86,30 @@ ONEFLOW_API_PYBIND11_MODULE("nn.graph.", m) {
m.def("RunLazyNNGraph", &RunLazyNNGraph);
m.def("SoftSyncNNGraphBuffers", &SoftSyncNNGraphBuffers);
m.def("AddTensorAsGraphLoss", &AddTensorAsGraphLoss);
m.def("MarkVariableGradients", [](const std::vector<std::shared_ptr<one::Tensor>>& variables,
const std::vector<std::shared_ptr<one::Tensor>>& gradients) {
one::TensorTuple variable_tuple(variables.size());
one::TensorTuple gradient_tuple(gradients.size());
for (int i = 0; i < variables.size(); ++i) { variable_tuple[i] = variables[i]; }
for (int i = 0; i < gradients.size(); ++i) { gradient_tuple[i] = gradients[i]; }
return MarkVariableGradients(variable_tuple, gradient_tuple);
});
m.def("ConvertJobToTosaIR", [](const std::string& serialized_job) -> Maybe<std::string> {
Job job;
CHECK_OR_RETURN(job.ParseFromString(serialized_job)) << "serialized job conversion failed.";
return ConvertJobToTosaIR(&job);
});
m.def("SaveJobToIR",
[](const std::string& serialized_job, const std::string& path) -> Maybe<void> {
Job job;
CHECK_OR_RETURN(TxtString2PbMessage(serialized_job, &job))
<< "serialized job conversion failed.";
return SaveJobToIR(&job, path);
});
m.def(
"SaveJobToIR", [](const std::string& serialized_job, const std::string& path) -> Maybe<void> {
Job job;
CHECK_OR_RETURN(job.ParseFromString(serialized_job)) << "serialized job conversion failed.";
return SaveJobToIR(&job, path);
});
m.def("ConvertJobToIR", [](const std::string& serialized_job) -> Maybe<std::string> {
Job job;
CHECK_OR_RETURN(job.ParseFromString(serialized_job)) << "serialized job conversion failed.";
return ConvertJobToIR(&job);
});
m.def("LoadSerializedJobFromIR", [](const std::string& path) -> Maybe<py::bytes> {
Job job;
JUST(LoadJobFromIR(&job, path));
......
......@@ -222,6 +222,7 @@ class PersistentTableReaderImpl : public PersistentTableReader {
options.value_size = storage_dim * sizeof(Value);
options.target_chunk_size_mb = target_chunk_size_mb;
options.physical_block_size = physical_block_size;
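// The reader never mutates the table, so open the underlying storage read-only.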
options.read_only = true;
tables_[i] = NewPersistentTable(options);
iterators_[i] =
std::unique_ptr<PersistentTable::Iterator>(tables_[i]->ReadSnapshot(snapshot_name));
......