Commit a715222c authored by yuguo's avatar yuguo
Browse files

0.9.1-rocm

parent f262efc9
...@@ -30,7 +30,7 @@ void FunctionSchema::ReportKwargsError(PyObject* kwargs, size_t nargs) const { ...@@ -30,7 +30,7 @@ void FunctionSchema::ReportKwargsError(PyObject* kwargs, size_t nargs) const {
int64_t index = -1; int64_t index = -1;
const std::string string_key = PyStringAsString(key); const std::string string_key = PyStringAsString(key);
for (int i = 0; i < def_->argument_def.size(); ++i) { for (int i = 0; i < def_->argument_def.size(); ++i) {
const auto& arg = def_->argument_def.at(i); const auto& arg = def_->argument_def[i];
if (arg.name == string_key) { if (arg.name == string_key) {
index = i; index = i;
break; break;
...@@ -56,7 +56,7 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a ...@@ -56,7 +56,7 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a
size_t remaining_kwargs = kwargs ? PyDict_Size(kwargs) : 0; size_t remaining_kwargs = kwargs ? PyDict_Size(kwargs) : 0;
if (max_pos_nargs_ == 1) { if (max_pos_nargs_ == 1) {
const auto& type = def_->argument_def.at(0).type; const auto& type = def_->argument_def[0].type;
treat_args_as_list = IsIntegralListType(type) || type == kSHAPE || type == kTENSOR_TUPLE; treat_args_as_list = IsIntegralListType(type) || type == kSHAPE || type == kTENSOR_TUPLE;
} }
if (nargs > max_pos_nargs_ && !treat_args_as_list) { if (nargs > max_pos_nargs_ && !treat_args_as_list) {
...@@ -68,7 +68,7 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a ...@@ -68,7 +68,7 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a
} }
int arg_pos = 0; int arg_pos = 0;
for (int i = 0; i < def_->argument_def.size(); ++i) { for (int i = 0; i < def_->argument_def.size(); ++i) {
const auto& param = def_->argument_def.at(i); const auto& param = def_->argument_def[i];
PyObject* obj = NULL; PyObject* obj = NULL;
if (args && arg_pos < nargs) { if (args && arg_pos < nargs) {
if (param.keyword_only) { if (param.keyword_only) {
...@@ -77,10 +77,10 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a ...@@ -77,10 +77,10 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a
} }
return false; return false;
} }
obj = PyTuple_GetItem(args, arg_pos); obj = PyTuple_GET_ITEM(args, arg_pos);
} else if (kwargs) { } else if (kwargs) {
obj = PyDict_GetItemString(kwargs, param.name.c_str()); obj = PyDict_GetItemString(kwargs, param.name.c_str());
if (obj) { remaining_kwargs--; } if (obj) { --remaining_kwargs; }
} }
if (obj) { if (obj) {
...@@ -89,10 +89,10 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a ...@@ -89,10 +89,10 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a
obj = args; obj = args;
arg_pos = nargs; arg_pos = nargs;
} else { } else {
arg_pos++; ++arg_pos;
} }
PythonArg arg(obj, param.size); PythonArg arg(obj, param.size);
if ((obj == Py_None && param.optional) || PythonArgCheck(arg, param.type)) { if ((obj == Py_None && param.optional) || arg.TypeCheck(param.type)) {
parsed_args[i] = arg; parsed_args[i] = arg;
} else { } else {
if (raise_exception) { if (raise_exception) {
...@@ -109,7 +109,7 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a ...@@ -109,7 +109,7 @@ bool FunctionSchema::Parse(PyObject* args, PyObject* kwargs, PythonArg* parsed_a
} }
return false; return false;
} }
parsed_args[i] = param.default_value; parsed_args[i] = param.default_value.get();
} }
} }
if (remaining_kwargs > 0) { if (remaining_kwargs > 0) {
......
...@@ -17,33 +17,75 @@ limitations under the License. ...@@ -17,33 +17,75 @@ limitations under the License.
#define ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_FRAME_H_ #define ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_FRAME_H_
#include <Python.h> #include <Python.h>
#include <cstdint>
#include <string>
#include <vector>
#include "oneflow/api/python/functional/common.h" #include "oneflow/api/python/functional/common.h"
#include "oneflow/core/framework/op_interpreter/dispatch_frame.h" #include "oneflow/core/framework/op_interpreter/dispatch_frame.h"
#include "oneflow/core/job/graph_scope_vars.h" #include "oneflow/core/job/graph_scope_vars.h"
#include "oneflow/core/profiler/profiler.h"
namespace oneflow { namespace oneflow {
namespace one { namespace one {
namespace functional { namespace functional {
namespace { namespace {
std::string get_cur_frame_stack_str(int32_t max_stack_depth) {
std::string cur_f_str; // get a formatted stack frame representation
PyFrameObject* cur_frame = PyEval_GetFrame(); // example: Python Stack[-10]: '__call__' at '.../graph/graph.py': line 219
for (int32_t i = 0; i < max_stack_depth; i++) { std::string get_python_frame_str_repr(int32_t stack_index, PyFrameObject* frame) {
if (cur_frame == NULL) break; if (frame == NULL) return "";
const int32_t stack_index = (-1) * i - 1; PyCodeObject* code = frame->f_code;
cur_f_str = "Python Stack[" + std::to_string(stack_index) std::string repr = "Python Stack[" + std::to_string(stack_index) + "]: ";
+ "]: " + PyObjectToReprStr((PyObject*)cur_frame) + "; " + cur_f_str; std::string file_name = PyObjectToReprStr(code->co_filename);
cur_frame = cur_frame->f_back; std::string code_name = PyObjectToReprStr(code->co_name);
int line_number = PyFrame_GetLineNumber(frame);
return repr + code_name + " at " + file_name + ": line " + std::to_string(line_number) + "; ";
}
bool check_if_python_file_should_be_filtered(const std::string& path) {
const auto& paths_to_be_kept = GetPythonPathsToBeKeptForDebugging();
for (int i = 0; i < paths_to_be_kept.size(); ++i) {
const std::string& path_to_be_kept = paths_to_be_kept[i];
if (path.size() > path_to_be_kept.size()) {
if (path.substr(0, path_to_be_kept.size()) == path_to_be_kept) { return false; }
}
} }
return cur_f_str;
const auto& paths_to_be_filtered = GetPythonPathsToBeFilteredForDebugging();
for (int i = 0; i < paths_to_be_filtered.size(); ++i) {
const std::string& path_to_be_filtered = paths_to_be_filtered[i];
if (path.size() > path_to_be_filtered.size()) {
if (path.substr(0, path_to_be_filtered.size()) == path_to_be_filtered) { return true; }
}
}
return false;
}
bool check_if_frame_should_be_filtered(PyFrameObject* frame) {
std::string frame_file_name = PyObjectToReprStr(frame->f_code->co_filename);
frame_file_name = frame_file_name.substr(1, frame_file_name.size() - 2); // get rid of ' '
return check_if_python_file_should_be_filtered(frame_file_name);
}
bool check_if_should_skip_this_frame(PyFrameObject* frame) {
const bool only_user_py_stack = GetGraphDebugOnlyUserPyStack();
if (only_user_py_stack) { return check_if_frame_should_be_filtered(frame); }
return false;
} }
int32_t get_cur_stack_depth() { int32_t get_cur_stack_depth() {
int32_t current_stack_depth = 0; int32_t current_stack_depth = 0;
PyFrameObject* f = PyEval_GetFrame(); PyFrameObject* f = PyEval_GetFrame();
while (f) { while (f) {
if (check_if_should_skip_this_frame(f)) {
f = f->f_back;
continue;
}
current_stack_depth++; current_stack_depth++;
f = f->f_back; f = f->f_back;
} }
...@@ -51,20 +93,40 @@ int32_t get_cur_stack_depth() { ...@@ -51,20 +93,40 @@ int32_t get_cur_stack_depth() {
} }
std::string get_cur_frame_stack_str() { std::string get_cur_frame_stack_str() {
const bool debug_mode = GetGraphDebugMode();
const int32_t max_stack_depth = GetGraphDebugMaxPyStackDepth(); const int32_t max_stack_depth = GetGraphDebugMaxPyStackDepth();
if (debug_mode) { // show more info for the stack trace in debug mode std::string cur_f_str;
int32_t current_stack_depth = get_cur_stack_depth(); PyFrameObject* cur_frame = PyEval_GetFrame();
std::string cur_f_str = get_cur_frame_stack_str(max_stack_depth);
if (current_stack_depth > max_stack_depth) { // show how many stack depth remaining to be shown int i = 0;
int32_t remaining_stack_depth = current_stack_depth - max_stack_depth; while (i < max_stack_depth) {
cur_f_str += " ... " + std::to_string(remaining_stack_depth) + " more; "; if (cur_frame == NULL) break;
const int32_t stack_index = (-1) * i - 1;
if (check_if_should_skip_this_frame(cur_frame)) {
cur_frame = cur_frame->f_back;
continue;
}
i++;
cur_f_str = get_python_frame_str_repr(stack_index, cur_frame) + cur_f_str;
cur_frame = cur_frame->f_back;
}
const bool debug_mode =
GetGraphDebugMode(); // show how may stack frames remain to be shown in debug mode
if (debug_mode) {
const int32_t current_stack_depth = get_cur_stack_depth();
if (current_stack_depth > max_stack_depth) {
cur_f_str += "... " + std::to_string(current_stack_depth - max_stack_depth) + " more";
} }
return cur_f_str; } else {
if (cur_frame != NULL) { cur_f_str += " ... more"; }
} }
return get_cur_frame_stack_str(max_stack_depth); return cur_f_str;
} }
} // namespace } // namespace
class PythonFrameGuard { class PythonFrameGuard {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// This code is referenced from:
// https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/structseq.cpp
#ifndef ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_RETURN_TYPES_H_
#define ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_RETURN_TYPES_H_
#include <Python.h>
#include <string>
#include <sstream>
#include <structmember.h>
#include "oneflow/api/python/exception/exception.h"
#include "oneflow/api/python/functional/common.h"
namespace oneflow {
namespace one {
namespace functional {
// Build a repr string for a PyStructSequence (named-tuple-like return type),
// e.g. "TypeName(\nfield1=repr1,\nfield2=repr2)".
// Returns a new PyUnicode reference, or nullptr with a Python error set.
// Referenced from pytorch's torch/csrc/utils/structseq.cpp (see file header).
PyObject* returned_structseq_repr(PyStructSequence* obj) {
  HANDLE_ERRORS
  PyTypeObject* tp = Py_TYPE(obj);
  // NOTE(review): PyObjectPtr presumably decrefs on destruction; wrapping `obj`
  // here without an incref means this function consumes a reference to the
  // argument — confirm that callers expect their reference to be stolen.
  PyObjectPtr tuple((PyObject*)obj);
  if (tuple == nullptr) { return nullptr; }
  std::stringstream ss;
  ss << tp->tp_name << "(\n";
  // A struct sequence is tuple-backed, so Py_SIZE gives the field count.
  Py_ssize_t num_elements = Py_SIZE(obj);
  for (Py_ssize_t i = 0; i < num_elements; i++) {
    // Field names come from the type's member table, parallel to the values.
    const char* cname = tp->tp_members[i].name;
    if (cname == nullptr) {
      PyErr_Format(PyExc_SystemError,
                   "In structseq_repr(), member %zd name is nullptr"
                   " for type %.500s",
                   i, tp->tp_name);
      return nullptr;
    }
    // Borrowed reference; no decref needed for `val`.
    PyObject* val = PyTuple_GetItem(tuple.get(), i);
    if (val == nullptr) { return nullptr; }
    // PyObject_Repr returns a new reference; PyObjectPtr releases it on scope exit.
    auto repr = PyObjectPtr(PyObject_Repr(val));
    if (repr == nullptr) { return nullptr; }
    // crepr points into `repr`'s internal buffer; valid only while `repr` lives.
    const char* crepr = PyUnicode_AsUTF8(repr.get());
    if (crepr == nullptr) { return nullptr; }
    ss << cname << '=' << crepr;
    // Separate fields with ",\n"; no trailing separator after the last one.
    if (i < num_elements - 1) { ss << ",\n"; }
  }
  ss << ")";
  return PyUnicode_FromString(ss.str().c_str());
  END_HANDLE_ERRORS
}
} // namespace functional
} // namespace one
} // namespace oneflow
#endif // ONEFLOW_API_PYTHON_FUNCTIONAL_PYTHON_RETURN_TYPES_H_
...@@ -17,6 +17,7 @@ limitations under the License. ...@@ -17,6 +17,7 @@ limitations under the License.
#include <memory> #include <memory>
#include "oneflow/api/python/utils/tensor_utils.h" #include "oneflow/api/python/utils/tensor_utils.h"
#include "oneflow/api/python/dlpack/converter.h"
#include "oneflow/api/python/framework/size.h" #include "oneflow/api/python/framework/size.h"
#include "oneflow/api/python/functional/common.h" #include "oneflow/api/python/functional/common.h"
#include "oneflow/api/python/functional/tensor_api.yaml.h" #include "oneflow/api/python/functional/tensor_api.yaml.h"
...@@ -50,6 +51,11 @@ class TensorWithDataFunctor { ...@@ -50,6 +51,11 @@ class TensorWithDataFunctor {
// even if in nn.Graph build (module forward function), if you create a flow.Tensor, // even if in nn.Graph build (module forward function), if you create a flow.Tensor,
// its a eager tensor by Run functional::Empty() in LazyMode::Grad(false) // its a eager tensor by Run functional::Empty() in LazyMode::Grad(false)
LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
if (GlobalMode::is_enabled()) {
return JUST(
functional::GlobalTensorWithData(data, dtype, GetGlobalParallelDescFromDevice(device),
*JUST(GetSbpList(GlobalMode::nd_sbp())), requires_grad));
}
if (PyTensor_Check(data)) { if (PyTensor_Check(data)) {
// Throw warnings like pytorch. // Throw warnings like pytorch.
...@@ -70,7 +76,7 @@ class TensorWithDataFunctor { ...@@ -70,7 +76,7 @@ class TensorWithDataFunctor {
} }
}; };
class ConsistentTensorWithDataFunctor { class GlobalTensorWithDataFunctor {
public: public:
Maybe<Tensor> operator()(PyObject* data, const Optional<Symbol<DType>>& dtype, Maybe<Tensor> operator()(PyObject* data, const Optional<Symbol<DType>>& dtype,
const Symbol<ParallelDesc>& placement, const Symbol<ParallelDesc>& placement,
...@@ -93,8 +99,8 @@ class ConsistentTensorWithDataFunctor { ...@@ -93,8 +99,8 @@ class ConsistentTensorWithDataFunctor {
const auto& other = PyTensor_Unpack(data); const auto& other = PyTensor_Unpack(data);
return MakeTensorFromOtherTensor(other, dtype, placement, sbp_tuple, requires_grad); return MakeTensorFromOtherTensor(other, dtype, placement, sbp_tuple, requires_grad);
} }
// Make consistent tensor from python sequence or numpy array. // Make global tensor from python sequence or numpy array.
return MakeConsistentTensorFromData(data, dtype, placement, sbp_tuple, requires_grad); return MakeGlobalTensorFromData(data, dtype, placement, sbp_tuple, requires_grad);
} }
}; };
...@@ -106,13 +112,13 @@ class TensorEmptyCtorFunctor { ...@@ -106,13 +112,13 @@ class TensorEmptyCtorFunctor {
} }
}; };
class ConsistentTensorEmptyCtorFunctor { class GlobalTensorEmptyCtorFunctor {
public: public:
Maybe<Tensor> operator()(const Symbol<ParallelDesc>& placement, Maybe<Tensor> operator()(const Symbol<ParallelDesc>& placement,
const std::vector<Symbol<SbpParallel>>& sbp_tuple) const { const std::vector<Symbol<SbpParallel>>& sbp_tuple) const {
Shape shape(DimVector{0}); Shape shape(DimVector{0});
JUST(CheckDeviceIdsIsValid(placement)); JUST(CheckDeviceIdsIsValid(placement));
return ConsistentTensorWithShapeCtor(shape, placement, sbp_tuple); return GlobalTensorWithShapeCtor(shape, placement, sbp_tuple);
} }
}; };
...@@ -122,7 +128,7 @@ class TensorWithOtherCtorFunctor { ...@@ -122,7 +128,7 @@ class TensorWithOtherCtorFunctor {
// NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now. // NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now.
LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
bool is_pinned = false; bool is_pinned = false;
if (other->is_local()) { is_pinned = JUST(CHECK_JUST(other->AsMirroredTensor())->is_pinned()); } if (other->is_local()) { is_pinned = JUST(CHECK_JUST(other->AsLocalTensor())->is_pinned()); }
return MakeTensorFromOtherTensor(other, is_pinned); return MakeTensorFromOtherTensor(other, is_pinned);
} }
}; };
...@@ -141,11 +147,11 @@ class TensorWithDataCtorFunctor { ...@@ -141,11 +147,11 @@ class TensorWithDataCtorFunctor {
// NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now. // NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now.
LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
const auto& dtype = DType::Float(); const auto& dtype = GetDefaultDType();
if (PyTensor_Check(data)) { if (PyTensor_Check(data)) {
const auto& other = PyTensor_Unpack(data); const auto& other = PyTensor_Unpack(data);
const bool pin_memory = const bool pin_memory =
other->is_local() ? JUST(JUST(other->AsMirroredTensor())->is_pinned()) : false; other->is_local() ? JUST(JUST(other->AsLocalTensor())->is_pinned()) : false;
return MakeTensorFromOtherTensor(other, dtype, device, return MakeTensorFromOtherTensor(other, dtype, device,
/*requires_grad=*/false, /*pin_memory=*/pin_memory); /*requires_grad=*/false, /*pin_memory=*/pin_memory);
} }
...@@ -155,7 +161,7 @@ class TensorWithDataCtorFunctor { ...@@ -155,7 +161,7 @@ class TensorWithDataCtorFunctor {
} }
}; };
class ConsistentTensorWithDataCtorFunctor { class GlobalTensorWithDataCtorFunctor {
public: public:
Maybe<Tensor> operator()(PyObject* data, const Symbol<ParallelDesc>& placement, Maybe<Tensor> operator()(PyObject* data, const Symbol<ParallelDesc>& placement,
const std::vector<Symbol<SbpParallel>>& sbp_tuple) const { const std::vector<Symbol<SbpParallel>>& sbp_tuple) const {
...@@ -164,23 +170,23 @@ class ConsistentTensorWithDataCtorFunctor { ...@@ -164,23 +170,23 @@ class ConsistentTensorWithDataCtorFunctor {
if (PyLong_Check(data)) { if (PyLong_Check(data)) {
int64_t size = PyLong_AsLongLong(data); int64_t size = PyLong_AsLongLong(data);
Shape shape(DimVector{size}); Shape shape(DimVector{size});
return ConsistentTensorWithShapeCtor(shape, placement, sbp_tuple); return GlobalTensorWithShapeCtor(shape, placement, sbp_tuple);
} }
if (TensorSize_Check(data)) { if (TensorSize_Check(data)) {
return ConsistentTensorWithShapeCtor(TensorSize_AsShape(data), placement, sbp_tuple); return GlobalTensorWithShapeCtor(TensorSize_AsShape(data), placement, sbp_tuple);
} }
// NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now. // NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now.
LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
const auto& dtype = DType::Float(); const auto& dtype = GetDefaultDType();
if (PyTensor_Check(data)) { if (PyTensor_Check(data)) {
const auto& other = PyTensor_Unpack(data); const auto& other = PyTensor_Unpack(data);
return MakeTensorFromOtherTensor(other, dtype, placement, sbp_tuple, return MakeTensorFromOtherTensor(other, dtype, placement, sbp_tuple,
/*requires_grad=*/false); /*requires_grad=*/false);
} }
// Make consistent tensor from python sequence or numpy array. // Make global tensor from python sequence or numpy array.
return MakeConsistentTensorFromData(data, dtype, placement, sbp_tuple, /*requires_grad=*/false); return MakeGlobalTensorFromData(data, dtype, placement, sbp_tuple, /*requires_grad=*/false);
} }
}; };
...@@ -195,18 +201,18 @@ class TensorWithShapeCtorFunctor { ...@@ -195,18 +201,18 @@ class TensorWithShapeCtorFunctor {
} else { } else {
device_ = JUST(Device::New("cpu")); device_ = JUST(Device::New("cpu"));
} }
return functional::Empty(shape, DType::Float(), device_, /*pin_memory=*/false); return functional::Empty(shape, GetDefaultDType(), device_, /*pin_memory=*/false);
} }
}; };
class ConsistentTensorWithShapeCtorFunctor { class GlobalTensorWithShapeCtorFunctor {
public: public:
Maybe<Tensor> operator()(const Shape& shape, const Symbol<ParallelDesc>& placement, Maybe<Tensor> operator()(const Shape& shape, const Symbol<ParallelDesc>& placement,
const std::vector<Symbol<SbpParallel>>& sbp_tuple) const { const std::vector<Symbol<SbpParallel>>& sbp_tuple) const {
// NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now. // NOTE(chengcheng): flow.Tensor or flow.tensor ONLY created by EagerTensor now.
LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false); LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
JUST(CheckDeviceIdsIsValid(placement)); JUST(CheckDeviceIdsIsValid(placement));
return functional::ConsistentEmpty(shape, DType::Float(), placement, sbp_tuple); return functional::GlobalEmpty(shape, GetDefaultDType(), placement, sbp_tuple);
} }
}; };
...@@ -220,7 +226,9 @@ class AssignLocalTensorFunctor { ...@@ -220,7 +226,9 @@ class AssignLocalTensorFunctor {
// JUST(CheckInplaceValid(ref)); // align check to torch // JUST(CheckInplaceValid(ref)); // align check to torch
CHECK_OR_RETURN(ref->is_local() && value->is_local()) CHECK_OR_RETURN(ref->is_local() && value->is_local())
<< "Both ref and value must be local tensor."; << "Both ref and value must be local tensor.";
JUST(OpInterpUtil::Dispatch<TensorTuple>(*op_, {ref, value})); std::shared_ptr<one::Tensor> src = value;
if (ref->dtype() != src->dtype()) { src = JUST(To(src, ref->dtype(), false)); }
JUST(OpInterpUtil::Dispatch<TensorTuple>(*op_, {ref, src}));
return Maybe<void>::Ok(); return Maybe<void>::Ok();
} }
...@@ -228,6 +236,35 @@ class AssignLocalTensorFunctor { ...@@ -228,6 +236,35 @@ class AssignLocalTensorFunctor {
std::shared_ptr<OpExpr> op_; std::shared_ptr<OpExpr> op_;
}; };
// Convert a NumPy dimension/stride array (npy_intp, platform-sized signed
// integer) into a std::vector<int64_t>.
//
// Uses the iterator-range vector constructor instead of a hand-rolled
// element-copy loop; each npy_intp widens (or converts losslessly on LP64)
// to int64_t.
static std::vector<int64_t> get_shape_or_stride_from_numpy(size_t ndim, npy_intp* values) {
  return std::vector<int64_t>(values, values + ndim);
}
// Functor backing `flow.from_dlpack`: builds a local tensor that shares
// memory with a DLPack capsule produced by another framework.
// `obj` must be a PyCapsule named "dltensor"; per the DLPack protocol the
// capsule is single-use and is renamed after consumption.
class LocalTensorSharedDlPackDataFunctor {
 public:
  LocalTensorSharedDlPackDataFunctor() {}
  Maybe<Tensor> operator()(PyObject* obj) const {
    // PyCapsule_GetPointer returns nullptr (with a Python error set) if the
    // capsule's name is not "dltensor" — including already-consumed
    // "used_dltensor" capsules.
    DLManagedTensor* dlMTensor = (DLManagedTensor*)PyCapsule_GetPointer(obj, "dltensor");
    CHECK_NOTNULL_OR_RETURN(dlMTensor)
        << "from_dlpack received an invalid capsule. "
           "Note that DLTensor capsules can be consumed only once, "
           "so you might have already constructed a tensor from it once.";
    // `tensor` steals the ownership of the underlying storage. It also passes a
    // destructor function that will be called when the underlying storage goes
    // out of scope. When the destructor is called, the dlMTensor is destructed
    // too.
    auto tensor = fromDLPack(dlMTensor);
    // Make sure this capsule will never be used again.
    // (Renaming to "used_dltensor" is the convention DLPack consumers use to
    // mark a consumed capsule.)
    PyCapsule_SetName(obj, "used_dltensor");
    return tensor;
  }
};
class LocalTensorSharedNumpyDataFunctor { class LocalTensorSharedNumpyDataFunctor {
public: public:
LocalTensorSharedNumpyDataFunctor() {} LocalTensorSharedNumpyDataFunctor() {}
...@@ -236,37 +273,45 @@ class LocalTensorSharedNumpyDataFunctor { ...@@ -236,37 +273,45 @@ class LocalTensorSharedNumpyDataFunctor {
return Error::TypeError() << "expected np.ndarray, but got " << Py_TYPE(obj)->tp_name; return Error::TypeError() << "expected np.ndarray, but got " << Py_TYPE(obj)->tp_name;
} }
auto* array = reinterpret_cast<PyArrayObject*>(obj); auto* array = reinterpret_cast<PyArrayObject*>(obj);
// TODO(wyg): support non-contiguous array. const size_t ndim = PyArray_NDIM(array);
if (!PyArray_IS_C_CONTIGUOUS(array)) { std::vector<int64_t> sizes = get_shape_or_stride_from_numpy(ndim, PyArray_DIMS(array));
OF_LOG_ONCE(LOG(WARNING) << "OneFlow don't support non-contiguous array now, " std::vector<int64_t> strides = get_shape_or_stride_from_numpy(ndim, PyArray_STRIDES(array));
"and we will copy the array to a contiguous one."); // NumPy strides use bytes. OneFlow strides use element counts.
// PyArray_GETCONTIGUOUS will return a reference if array is already contiguous, // These checks are consistent with pytorch(v1.10.0):
// otherwise return a (contiguous) copy of the array. // https://github.com/pytorch/pytorch/blob/v1.10.0/torch/csrc/utils/tensor_numpy.cpp#L171
// Note: Increment the reference count for array occurs whether the array is continuous or not const auto element_size_in_bytes = PyArray_ITEMSIZE(array);
array = PyArray_GETCONTIGUOUS(array); for (auto& stride : strides) {
} else { if (stride % element_size_in_bytes != 0) {
Py_INCREF(obj); return Error::InvalidValueError()
<< "given numpy array strides not a multiple of the element byte size. "
<< "Copy the numpy array to reallocate the memory.";
}
stride /= element_size_in_bytes;
} }
for (size_t i = 0; i < ndim; ++i) {
if (strides[i] < 0) {
return Error::InvalidValueError()
<< "At least one stride in the given numpy array is negative, "
<< "and tensors with negative strides are not currently supported. "
<< "(You can probably work around this by making a copy of your array "
<< " with array.copy().) ";
}
}
void* data_ptr = PyArray_DATA(array);
if (!PyArray_EquivByteorders(PyArray_DESCR(array)->byteorder, NPY_NATIVE)) {
return Error::InvalidValueError()
<< "given numpy array has byte order different from the native byte order. "
<< "Conversion between byte orders is currently not supported.";
}
Py_INCREF(obj);
// Build TensorMeta // Build TensorMeta
int32_t dim = PyArray_NDIM(array); const auto shape = Shape(DimVector(sizes.begin(), sizes.end()));
const npy_intp* dims_ptr = PyArray_SHAPE(array); const auto stride = Stride(strides.begin(), strides.end());
const auto shape = std::make_shared<Shape>(DimVector(dims_ptr, dims_ptr + dim));
DataType data_type = JUST(numpy::GetOFDataTypeFromNpArray(array)); DataType data_type = JUST(numpy::GetOFDataTypeFromNpArray(array));
Symbol<Device> device = JUST(Device::New("cpu")); Symbol<Device> device = JUST(Device::New("cpu"));
const npy_intp* stride_ptr = PyArray_STRIDES(array);
// stride auto tensor_meta = SymbolOf(LocalTensorMeta(shape, stride, data_type, device));
auto strides = std::make_shared<Stride>(stride_ptr, stride_ptr + dim);
auto element_size_in_bytes = PyArray_ITEMSIZE(array);
// NumPy strides use bytes. OneFlow strides use element counts.
for (auto& stride_val : *strides) {
if (stride_val % element_size_in_bytes != 0) {
return Error::RuntimeError() << "given numpy array strides not a multiple of the element "
"byte size. Copy the numpy array to reallocate the memory.";
}
stride_val /= element_size_in_bytes;
}
auto tensor_meta = std::make_shared<MirroredTensorMeta>(shape, strides, data_type, device, 0);
// Build TensorBuffer // Build TensorBuffer
const auto& Free = [array](char* dptr) { const auto& Free = [array](char* dptr) {
...@@ -275,9 +320,9 @@ class LocalTensorSharedNumpyDataFunctor { ...@@ -275,9 +320,9 @@ class LocalTensorSharedNumpyDataFunctor {
return Maybe<void>::Ok(); return Maybe<void>::Ok();
})); }));
}; };
void* data_ptr = PyArray_DATA(array);
auto array_size_in_bytes = PyArray_NBYTES(array); const auto array_size_in_bytes = PyArray_NBYTES(array);
auto tensor_data = std::make_shared<vm::TensorStorage>(); auto tensor_data = std::make_shared<vm::OutsideVmTensorStorage>();
tensor_data->set_blob_dptr( tensor_data->set_blob_dptr(
std::unique_ptr<char, std::function<void(char*)>>(static_cast<char*>(data_ptr), Free), std::unique_ptr<char, std::function<void(char*)>>(static_cast<char*>(data_ptr), Free),
array_size_in_bytes); array_size_in_bytes);
...@@ -286,17 +331,17 @@ class LocalTensorSharedNumpyDataFunctor { ...@@ -286,17 +331,17 @@ class LocalTensorSharedNumpyDataFunctor {
auto tensor_storage = std::make_shared<TensorStorage>(tensor_data); auto tensor_storage = std::make_shared<TensorStorage>(tensor_data);
// Build Tensor // Build Tensor
auto tensor_impl = std::make_shared<EagerMirroredTensorImpl>(tensor_meta, tensor_storage, auto tensor_impl = std::make_shared<EagerLocalTensorImpl>(tensor_storage,
/*requires_grad=*/false, /*requires_grad=*/false,
/*ls_leaf=*/true); /*ls_leaf=*/true);
// Init blob // Init blob
JUST(tensor_impl->InitEagerBlobObject(NewLocalDepObject())); JUST(tensor_impl->InitEagerBlobObject(tensor_meta, NewLocalDepObject()));
const auto& stream = JUST(GetDefaultStreamByDevice(device)); const auto& stream = JUST(GetDefaultStreamByDevice(device));
const auto& eager_blob_object = JUST(tensor_impl->eager_blob_object()); const auto& eager_blob_object = JUST(tensor_impl->eager_blob_object());
JUST(eager_blob_object->init_producer_stream(stream)); JUST(eager_blob_object->init_producer_stream(stream));
eager_blob_object->set_last_used_stream(stream); eager_blob_object->set_last_used_stream(stream);
std::shared_ptr<Tensor> out(new MirroredTensor(tensor_impl)); std::shared_ptr<Tensor> out(new LocalTensor(tensor_impl));
return out; return out;
} }
}; };
...@@ -305,16 +350,17 @@ class LocalTensorSharedNumpyDataFunctor { ...@@ -305,16 +350,17 @@ class LocalTensorSharedNumpyDataFunctor {
ONEFLOW_FUNCTION_LIBRARY(m) { ONEFLOW_FUNCTION_LIBRARY(m) {
m.add_functor<impl::TensorWithDataFunctor>("TensorWithData"); m.add_functor<impl::TensorWithDataFunctor>("TensorWithData");
m.add_functor<impl::ConsistentTensorWithDataFunctor>("ConsistentTensorWithData"); m.add_functor<impl::GlobalTensorWithDataFunctor>("GlobalTensorWithData");
m.add_functor<impl::TensorEmptyCtorFunctor>("TensorEmptyCtor"); m.add_functor<impl::TensorEmptyCtorFunctor>("TensorEmptyCtor");
m.add_functor<impl::ConsistentTensorEmptyCtorFunctor>("ConsistentTensorEmptyCtor"); m.add_functor<impl::GlobalTensorEmptyCtorFunctor>("GlobalTensorEmptyCtor");
m.add_functor<impl::TensorWithOtherCtorFunctor>("TensorWithOtherCtor"); m.add_functor<impl::TensorWithOtherCtorFunctor>("TensorWithOtherCtor");
m.add_functor<impl::TensorWithDataCtorFunctor>("TensorWithDataCtor"); m.add_functor<impl::TensorWithDataCtorFunctor>("TensorWithDataCtor");
m.add_functor<impl::ConsistentTensorWithDataCtorFunctor>("ConsistentTensorWithDataCtor"); m.add_functor<impl::GlobalTensorWithDataCtorFunctor>("GlobalTensorWithDataCtor");
m.add_functor<impl::TensorWithShapeCtorFunctor>("TensorWithShapeCtor"); m.add_functor<impl::TensorWithShapeCtorFunctor>("TensorWithShapeCtor");
m.add_functor<impl::ConsistentTensorWithShapeCtorFunctor>("ConsistentTensorWithShapeCtor"); m.add_functor<impl::GlobalTensorWithShapeCtorFunctor>("GlobalTensorWithShapeCtor");
m.add_functor<impl::AssignLocalTensorFunctor>("AssignLocalTensor"); m.add_functor<impl::AssignLocalTensorFunctor>("AssignLocalTensor");
m.add_functor<impl::LocalTensorSharedNumpyDataFunctor>("LocalTensorSharedNumpyData"); m.add_functor<impl::LocalTensorSharedNumpyDataFunctor>("LocalTensorSharedNumpyData");
m.add_functor<impl::LocalTensorSharedDlPackDataFunctor>("LocalTensorSharedDlPackData");
} }
} // namespace functional } // namespace functional
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
"Tensor (PyObject* data, *, DataType dtype=None, Device device=None, "Tensor (PyObject* data, *, DataType dtype=None, Device device=None,
Bool requires_grad=False, Bool pin_memory=False) => TensorWithData", Bool requires_grad=False, Bool pin_memory=False) => TensorWithData",
"Tensor (PyObject* data, *, DataType dtype=None, Placement placement, "Tensor (PyObject* data, *, DataType dtype=None, Placement placement,
SbpList sbp, Bool requires_grad=False) => ConsistentTensorWithData", SbpList sbp, Bool requires_grad=False) => GlobalTensorWithData",
] ]
bind_python: True bind_python: True
...@@ -25,12 +25,12 @@ ...@@ -25,12 +25,12 @@
signature: signature:
[ [
"Tensor (*, Device device=None) => TensorEmptyCtor", "Tensor (*, Device device=None) => TensorEmptyCtor",
"Tensor (*, Placement placement, SbpList sbp) => ConsistentTensorEmptyCtor", "Tensor (*, Placement placement, SbpList sbp) => GlobalTensorEmptyCtor",
"Tensor (Tensor other) => TensorWithOtherCtor", "Tensor (Tensor other) => TensorWithOtherCtor",
"Tensor (PyObject* data, *, Device device=None) => TensorWithDataCtor", "Tensor (PyObject* data, *, Device device=None) => TensorWithDataCtor",
"Tensor (PyObject* data, *, Placement placement, SbpList sbp) => ConsistentTensorWithDataCtor", "Tensor (PyObject* data, *, Placement placement, SbpList sbp) => GlobalTensorWithDataCtor",
"Tensor (Shape size, *, Device device=None) => TensorWithShapeCtor", "Tensor (Shape size, *, Device device=None) => TensorWithShapeCtor",
"Tensor (Shape size, *, Placement placement, SbpList sbp) => ConsistentTensorWithShapeCtor", "Tensor (Shape size, *, Placement placement, SbpList sbp) => GlobalTensorWithShapeCtor",
] ]
bind_python: True bind_python: True
...@@ -41,3 +41,7 @@ ...@@ -41,3 +41,7 @@
- name: "from_numpy" - name: "from_numpy"
signature: "Tensor (PyObject* obj) => LocalTensorSharedNumpyData" signature: "Tensor (PyObject* obj) => LocalTensorSharedNumpyData"
bind_python: True bind_python: True
- name: "from_dlpack"
signature: "Tensor (PyObject* obj) => LocalTensorSharedDlPackData"
bind_python: True
...@@ -94,6 +94,7 @@ enum ValueType : int { ...@@ -94,6 +94,7 @@ enum ValueType : int {
kATTR_REF, kATTR_REF,
kDTYPE, kDTYPE,
kSHAPE, kSHAPE,
kSHAPE_MAYBE,
kGENERATOR, kGENERATOR,
kGENERATOR_REF, kGENERATOR_REF,
kGENERATOR_MAYBE, kGENERATOR_MAYBE,
...@@ -152,6 +153,7 @@ VALUE_TYPE_OF_IMPL(Maybe<one::TensorTuple>, kTENSOR_TUPLE_MAYBE); ...@@ -152,6 +153,7 @@ VALUE_TYPE_OF_IMPL(Maybe<one::TensorTuple>, kTENSOR_TUPLE_MAYBE);
VALUE_TYPE_OF_IMPL(Symbol<DType>, kDTYPE); VALUE_TYPE_OF_IMPL(Symbol<DType>, kDTYPE);
VALUE_TYPE_OF_IMPL(std::vector<Symbol<DType>>, kDTYPE_LIST); VALUE_TYPE_OF_IMPL(std::vector<Symbol<DType>>, kDTYPE_LIST);
VALUE_TYPE_OF_IMPL(Shape, kSHAPE); VALUE_TYPE_OF_IMPL(Shape, kSHAPE);
VALUE_TYPE_OF_IMPL(Maybe<Shape>, kSHAPE_MAYBE);
VALUE_TYPE_OF_IMPL(std::vector<Shape>, kSHAPE_LIST); VALUE_TYPE_OF_IMPL(std::vector<Shape>, kSHAPE_LIST);
VALUE_TYPE_OF_IMPL(one::Generator, kGENERATOR); VALUE_TYPE_OF_IMPL(one::Generator, kGENERATOR);
VALUE_TYPE_OF_IMPL(std::shared_ptr<one::Generator>, kGENERATOR_REF); VALUE_TYPE_OF_IMPL(std::shared_ptr<one::Generator>, kGENERATOR_REF);
......
...@@ -13,19 +13,109 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -13,19 +13,109 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#include "oneflow/core/common/singleton.h"
#include "oneflow/ir/oneflow-extension/include/PyAst/Ast.h"
#include <llvm/IR/IntrinsicsS390.h>
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <string>
#include <iostream>
#include <tuple>
#include <vector>
#ifdef WITH_MLIR #ifdef WITH_MLIR
#include "oneflow/ir/include/OneFlow/Extension.h" #include "oneflow/ir/include/OneFlow/Extension.h"
#include "oneflow/ir/oneflow-extension/include/OneFlow/OneFlowRoundTrip.h" #include "oneflow/ir/oneflow-extension/include/OneFlow/OneFlowRoundTrip.h"
#include <glog/logging.h> #include "oneflow/ir/oneflow-extension/include/OneFlow/OneFlowLRJITRegistry.h"
#include "oneflow/api/python/of_api_registry.h" #include "oneflow/api/python/of_api_registry.h"
#include <glog/logging.h>
#include <functional>
#include <utility>
namespace oneflow { namespace oneflow {
ONEFLOW_API_PYBIND11_MODULE("ir", m) { ONEFLOW_API_PYBIND11_MODULE("ir", m) {
m.def("load_jit_shared_lib", m.def("load_jit_shared_lib",
[](const std::string& lib_path) { MutSharedLibPaths()->insert(lib_path); }); [](const std::string& lib_path) { MutSharedLibPaths()->insert(lib_path); });
// TODO: this may be move to a common place for create global singleton.
m.def("create_global_lr_jit", []() { Singleton<LRJITRegistry>::New(); });
m.def("compile_and_register_lr_jit", [](const std::string& function_id,
std::shared_ptr<pyast::FunctionDef>& func, bool is_dump) {
Singleton<LRJITRegistry>::Get()->Register(function_id, *func.get(), is_dump);
});
// look up and execute the registered function for python api
m.def("get_lr", [](const std::string& function_id, float base_lr, float step) {
auto engine = Singleton<LRJITRegistry>::Get()->LookUp(function_id);
return engine(base_lr, step);
});
pybind11::class_<pyast::stmt, std::shared_ptr<pyast::stmt>>(m, "smt");
pybind11::class_<pyast::expr, std::shared_ptr<pyast::expr>>(m, "expr");
pybind11::class_<pyast::FunctionDef, pyast::stmt, std::shared_ptr<pyast::FunctionDef>>(
m, "FunctionDef");
m.def("FunctionDef_", &pyast::FunctionDef::FunctionDef_);
pybind11::class_<pyast::Return, pyast::stmt, std::shared_ptr<pyast::Return>>(m, "Return");
m.def("Return_", &pyast::Return::Return_);
pybind11::class_<pyast::Assign, pyast::stmt, std::shared_ptr<pyast::Assign>>(m, "Assign");
m.def("Assign_", &pyast::Assign::Assign_);
pybind11::class_<pyast::If, pyast::stmt, std::shared_ptr<pyast::If>>(m, "If");
m.def("If_", &pyast::If::If_);
pybind11::class_<pyast::Raise, pyast::stmt, std::shared_ptr<pyast::Raise>>(m, "Raise");
m.def("Raise_", &pyast::Raise::Raise_);
pybind11::class_<pyast::Assert, pyast::stmt, std::shared_ptr<pyast::Assert>>(m, "Assert");
m.def("Assert_", &pyast::Assert::Assert_);
pybind11::class_<pyast::Expr, pyast::stmt, std::shared_ptr<pyast::Expr>>(m, "Expr");
m.def("Expr_", &pyast::Expr::Expr_);
pybind11::class_<pyast::BoolOp, pyast::expr, std::shared_ptr<pyast::BoolOp>>(m, "BoolOp");
m.def("BoolOp_", &pyast::BoolOp::BoolOp_);
pybind11::class_<pyast::BinOp, pyast::expr, std::shared_ptr<pyast::BinOp>>(m, "BinOp");
m.def("BinOp_", &pyast::BinOp::BinOp_);
pybind11::class_<pyast::Lambda, pyast::expr, std::shared_ptr<pyast::Lambda>>(m, "Lambda");
m.def("Lambda_", &pyast::Lambda::Lambda_);
pybind11::class_<pyast::Compare, pyast::expr, std::shared_ptr<pyast::Compare>>(m, "Compare");
m.def("Compare_", &pyast::Compare::Compare_);
pybind11::class_<pyast::Call, pyast::expr, std::shared_ptr<pyast::Call>>(m, "Call");
m.def("Call_", &pyast::Call::Call_);
pybind11::class_<pyast::Num, pyast::expr, std::shared_ptr<pyast::Num>>(m, "Num");
m.def("Num_", &pyast::Num::Num_);
pybind11::class_<pyast::Constant, pyast::expr, std::shared_ptr<pyast::Constant>>(m, "Constant");
m.def("Constant_", &pyast::Constant::Constant_);
pybind11::class_<pyast::Attribute, pyast::expr, std::shared_ptr<pyast::Attribute>>(m,
"Attribute");
m.def("Attribute_", &pyast::Attribute::Attribute_);
pybind11::class_<pyast::Name, pyast::expr, std::shared_ptr<pyast::Name>>(m, "Name");
m.def("Name_", &pyast::Name::Name_);
pybind11::class_<pyast::arguments, std::shared_ptr<pyast::arguments>>(m, "arguments");
m.def("arguments_", &pyast::arguments::arguments_);
pybind11::class_<pyast::arg, std::shared_ptr<pyast::arg>>(m, "arg");
m.def("arg_", &pyast::arg::arg_);
} }
} // namespace oneflow } // namespace oneflow
......
...@@ -22,51 +22,50 @@ namespace py = pybind11; ...@@ -22,51 +22,50 @@ namespace py = pybind11;
namespace oneflow { namespace oneflow {
// Records, for each (variable, gradient) pair, the variable's logical blob
// name -> its gradient's logical blob name in the current lazy infer context.
// Requires lazy (graph) mode; every tensor must already be registered in
// one::TensorNameScope.
Maybe<void> MarkVariableGradients(const one::TensorTuple& variables,
                                  const one::TensorTuple& gradients) {
  CHECK_OR_RETURN(LazyMode::is_enabled());                 // NOLINT(maybe-need-error-msg)
  CHECK_EQ_OR_RETURN(variables.size(), gradients.size());  // NOLINT(maybe-need-error-msg)
  HashMap<std::string, std::string> var_lbn_to_grad_lbn;
  for (int idx = 0; idx < variables.size(); ++idx) {
    const auto& var_lbn = one::TensorNameScope::Global()->Lookup(variables[idx]);
    CHECK_OR_RETURN(!var_lbn.empty())
        << "variable which index is " << idx << " expected to have a tensor name";
    const auto& grad_lbn = one::TensorNameScope::Global()->Lookup(gradients[idx]);
    CHECK_OR_RETURN(!grad_lbn.empty())
        << "gradient which index is " << idx << " expected to have a tensor name";
    var_lbn_to_grad_lbn.emplace(var_lbn, grad_lbn);
  }
  return JUST(GetCurInferCtx())->MarkVariableGradientBlobNames(var_lbn_to_grad_lbn);
}
// Records, for each (output, gradient) pair, the output's logical blob name ->
// its gradient's logical blob name in the current lazy infer context.
// Requires lazy (graph) mode; every tensor must already be registered in
// one::TensorNameScope.
Maybe<void> MarkOutputGradients(const one::TensorTuple& outputs,
                                const one::TensorTuple& gradients) {
  CHECK_OR_RETURN(LazyMode::is_enabled());               // NOLINT(maybe-need-error-msg)
  CHECK_EQ_OR_RETURN(outputs.size(), gradients.size());  // NOLINT(maybe-need-error-msg)
  HashMap<std::string, std::string> out_lbn_to_grad_lbn;
  for (int idx = 0; idx < outputs.size(); ++idx) {
    const auto& out_lbn = one::TensorNameScope::Global()->Lookup(outputs[idx]);
    CHECK_OR_RETURN(!out_lbn.empty())
        << "output which index is " << idx << " expected to have a tensor name";
    const auto& grad_lbn = one::TensorNameScope::Global()->Lookup(gradients[idx]);
    CHECK_OR_RETURN(!grad_lbn.empty())
        << "gradient which index is " << idx << " expected to have a tensor name";
    out_lbn_to_grad_lbn.emplace(out_lbn, grad_lbn);
  }
  return JUST(GetCurInferCtx())->MarkOutputGradientBlobNames(out_lbn_to_grad_lbn);
}
ONEFLOW_API_PYBIND11_MODULE("", m) { ONEFLOW_API_PYBIND11_MODULE("", m) {
m.def("JobBuildAndInferCtx_Open", &JobBuildAndInferCtx_Open); m.def("JobBuildAndInferCtx_Open", &JobBuildAndInferCtx_Open);
m.def("JobBuildAndInferCtx_GetCurrentJobName", &JobBuildAndInferCtx_GetCurrentJobName); m.def("JobBuildAndInferCtx_GetCurrentJobName", &JobBuildAndInferCtx_GetCurrentJobName);
m.def("JobBuildAndInferCtx_GetCurrentJobId", &JobBuildAndInferCtx_GetCurrentJobId); m.def("JobBuildAndInferCtx_GetCurrentJobId", &JobBuildAndInferCtx_GetCurrentJobId);
m.def("JobBuildAndInferCtx_Close", &JobBuildAndInferCtx_Close); m.def("JobBuildAndInferCtx_Close", &JobBuildAndInferCtx_Close);
m.def("CurJobBuildAndInferCtx_CheckJob", &CurJobBuildAndInferCtx_CheckJob);
m.def("CurJobBuildAndInferCtx_SetJobConf", &CurJobBuildAndInferCtx_SetJobConf); m.def("CurJobBuildAndInferCtx_SetJobConf", &CurJobBuildAndInferCtx_SetJobConf);
m.def("CurJobBuildAndInferCtx_SetTrainConf", &CurJobBuildAndInferCtx_SetTrainConf);
m.def("CurJobBuildAndInferCtx_Complete", &CurJobBuildAndInferCtx_Complete, m.def("CurJobBuildAndInferCtx_Complete", &CurJobBuildAndInferCtx_Complete,
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
m.def("CurJobBuildAndInferCtx_Rebuild", &CurJobBuildAndInferCtx_Rebuild,
py::call_guard<py::gil_scoped_release>());
m.def("CurJobBuildAndInferCtx_HasJobConf", &CurJobBuildAndInferCtx_HasJobConf);
m.def("CurJobBuildAndInferCtx_AddAndInferMirroredOp",
&CurJobBuildAndInferCtx_AddAndInferMirroredOp, py::call_guard<py::gil_scoped_release>());
m.def("CurJobBuildAndInferCtx_AddAndInferConsistentOp",
&CurJobBuildAndInferCtx_AddAndInferConsistentOp);
m.def("CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair",
&CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair);
m.def("JobBuildAndInferCtx_GetSerializedIdListAsStaticShape",
&JobBuildAndInferCtx_GetSerializedIdListAsStaticShape);
m.def("JobBuildAndInferCtx_GetDataType", &JobBuildAndInferCtx_GetDataType);
m.def("JobBuildAndInferCtx_IsDynamic", &JobBuildAndInferCtx_IsDynamic);
m.def("JobBuildAndInferCtx_IsDisableBoxing", &JobBuildAndInferCtx_IsDisableBoxing);
m.def("JobBuildAndInferCtx_GetSplitAxisFromProducerView",
&JobBuildAndInferCtx_GetSplitAxisFromProducerView);
m.def("JobBuildAndInferCtx_GetSerializedParallelConfFromProducerView",
&JobBuildAndInferCtx_GetSerializedParallelConfFromProducerView);
m.def("CurJobBuildAndInferCtx_AddLossLogicalBlobName",
&CurJobBuildAndInferCtx_AddLossLogicalBlobName);
m.def("JobBuildAndInferCtx_IsMirroredBlob", &JobBuildAndInferCtx_IsMirroredBlob);
m.def("JobBuildAndInferCtx_MirroredBlobGetNumSubLbi",
&JobBuildAndInferCtx_MirroredBlobGetNumSubLbi);
m.def("JobBuildAndInferCtx_MirroredBlobGetSerializedSubLbi",
&JobBuildAndInferCtx_MirroredBlobGetSubLbi);
m.def("JobBuildAndInferCtx_CheckLbnValidAndExist", &JobBuildAndInferCtx_CheckLbnValidAndExist);
m.def("JobBuildAndInferCtx_GetOpBlobLbn", &JobBuildAndInferCtx_GetOpBlobLbn);
} }
} // namespace oneflow } // namespace oneflow
...@@ -19,6 +19,7 @@ limitations under the License. ...@@ -19,6 +19,7 @@ limitations under the License.
#include "oneflow/core/job/global_for.h" #include "oneflow/core/job/global_for.h"
#include "oneflow/core/common/protobuf.h" #include "oneflow/core/common/protobuf.h"
#include "oneflow/core/framework/tensor.h" #include "oneflow/core/framework/tensor.h"
#include "oneflow/core/framework/tensor_tuple.h"
#include "oneflow/core/framework/tensor_name_scope.h" #include "oneflow/core/framework/tensor_name_scope.h"
#include "oneflow/core/job/job_build_and_infer_ctx.h" #include "oneflow/core/job/job_build_and_infer_ctx.h"
#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h" #include "oneflow/core/job/job_build_and_infer_ctx_mgr.h"
...@@ -48,129 +49,13 @@ inline Maybe<void> JobBuildAndInferCtx_Close() { ...@@ -48,129 +49,13 @@ inline Maybe<void> JobBuildAndInferCtx_Close() {
return Maybe<void>::Ok(); return Maybe<void>::Ok();
} }
// Runs CheckJob() on the job currently being built.
inline Maybe<void> CurJobBuildAndInferCtx_CheckJob() { return JUST(GetCurInferCtx())->CheckJob(); }
inline Maybe<void> CurJobBuildAndInferCtx_SetJobConf(const std::string& job_conf_str) { inline Maybe<void> CurJobBuildAndInferCtx_SetJobConf(const std::string& job_conf_str) {
JobConfigProto job_conf; JobConfigProto job_conf;
CHECK_OR_RETURN(TxtString2PbMessage(job_conf_str, &job_conf)) << "job conf parse failed"; CHECK_OR_RETURN(TxtString2PbMessage(job_conf_str, &job_conf)) << "job conf parse failed";
return JUST(GetCurInferCtx())->SetJobConf(job_conf); return JUST(GetCurInferCtx())->SetJobConf(job_conf);
} }
// Parses `train_conf_str` (protobuf text format) into a TrainConf and installs
// it on the job currently being built; fails if the text does not parse.
inline Maybe<void> CurJobBuildAndInferCtx_SetTrainConf(const std::string& train_conf_str) {
  TrainConf train_conf;
  CHECK_OR_RETURN(TxtString2PbMessage(train_conf_str, &train_conf)) << "train conf parse failed";
  return JUST(GetCurInferCtx())->SetTrainConf(train_conf);
}
inline Maybe<void> CurJobBuildAndInferCtx_Complete() { return JUST(GetCurInferCtx())->Complete(); } inline Maybe<void> CurJobBuildAndInferCtx_Complete() { return JUST(GetCurInferCtx())->Complete(); }
// Rebuilds the job currently being built.
inline Maybe<void> CurJobBuildAndInferCtx_Rebuild() { return JUST(GetCurInferCtx())->Rebuild(); }
// Returns whether a job conf has been set on the job currently being built.
inline Maybe<bool> CurJobBuildAndInferCtx_HasJobConf() {
  return JUST(GetCurInferCtx())->HasJobConf();
}
// Parses `op_conf_str` (OperatorConf text proto), adds it to the current job
// as a mirrored op, runs inference, and returns the inferred OpAttribute
// serialized back to protobuf text.
inline Maybe<std::string> CurJobBuildAndInferCtx_AddAndInferMirroredOp(
    const std::string& op_conf_str) {
  OperatorConf op_conf;
  CHECK_OR_RETURN(TxtString2PbMessage(op_conf_str, &op_conf)) << "operator conf parse failed";
  auto* ctx = JUST(GetCurInferCtx());
  const auto& op_attribute = JUST(ctx->AddAndInferMirroredOp(op_conf));
  return PbMessage2TxtString(*op_attribute);
}
// Parses `op_conf_str` (OperatorConf text proto), adds it to the current job
// as a consistent op, runs inference, and returns the inferred OpAttribute
// serialized back to protobuf text.
inline Maybe<std::string> CurJobBuildAndInferCtx_AddAndInferConsistentOp(
    const std::string& op_conf_str) {
  OperatorConf op_conf;
  CHECK_OR_RETURN(TxtString2PbMessage(op_conf_str, &op_conf)) << "operator conf parse failed";
  auto* ctx = JUST(GetCurInferCtx());
  const auto& op_attribute = JUST(ctx->AddAndInferConsistentOp(op_conf));
  return PbMessage2TxtString(*op_attribute);
}
// Parses `lbi_uuid_pair_str` (LbiAndDiffWatcherUuidPair text proto) and
// registers the pair on the job currently being built.
inline Maybe<void> CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(
    const std::string& lbi_uuid_pair_str) {
  auto* ctx = JUST(GetCurInferCtx());
  LbiAndDiffWatcherUuidPair lbi_uuid_pair;
  CHECK_OR_RETURN(TxtString2PbMessage(lbi_uuid_pair_str, &lbi_uuid_pair))
      << "LbiAndDiffWatcherUuidPair parse failed";
  return ctx->AddLbiAndDiffWatcherUuidPair(lbi_uuid_pair);
}
// Looks up the static shape of blob `lbn` in job `job_name` and returns its
// dims serialized as an Int64List text proto.
inline Maybe<std::string> JobBuildAndInferCtx_GetSerializedIdListAsStaticShape(
    const std::string& job_name, const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  const auto& shape = JUST(ctx->GetStaticShape(lbn));
  Int64List id_list;
  // Copy the shape's dim vector wholesale into the proto's repeated field.
  *id_list.mutable_value() = {shape->dim_vec().begin(), shape->dim_vec().end()};
  return PbMessage2TxtString(id_list);
}
// Returns the data type (as an integer enum value) of blob `lbn` in `job_name`.
inline Maybe<long long> JobBuildAndInferCtx_GetDataType(const std::string& job_name,
                                                        const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return JUST(ctx->GetDataType(lbn));
}
// Returns whether blob `lbn` in job `job_name` has a dynamic shape.
inline Maybe<bool> JobBuildAndInferCtx_IsDynamic(const std::string& job_name,
                                                 const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return ctx->IsDynamic(lbn);
}
// Returns whether boxing is disabled for blob `lbn` in job `job_name`.
inline Maybe<bool> JobBuildAndInferCtx_IsDisableBoxing(const std::string& job_name,
                                                       const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return ctx->IsDisableBoxing(lbn);
}
// Returns the producer-view split axis of blob `lbn` serialized as text proto.
inline Maybe<std::string> JobBuildAndInferCtx_GetSplitAxisFromProducerView(
    const std::string& job_name, const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return PbMessage2TxtString(*JUST(ctx->GetSplitAxisFromProducerView(lbn)));
}
// Returns the producer-view parallel conf of blob `lbn` serialized as text proto.
inline Maybe<std::string> JobBuildAndInferCtx_GetSerializedParallelConfFromProducerView(
    const std::string& job_name, const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return PbMessage2TxtString(JUST(ctx->GetParallelDescFromProducerView(lbn))->parallel_conf());
}
// Marks blob `lbn` as a loss of the job currently being built.
inline Maybe<void> CurJobBuildAndInferCtx_AddLossLogicalBlobName(const std::string& lbn) {
  return JUST(GetCurInferCtx())->AddLossLogicalBlobName(lbn);
}
// Returns whether blob `lbn` in job `job_name` is a mirrored blob.
inline Maybe<bool> JobBuildAndInferCtx_IsMirroredBlob(const std::string& job_name,
                                                      const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return ctx->IsMirroredBlob(lbn);
}
// Returns the number of sub-LBIs of mirrored blob `lbn` in job `job_name`.
inline Maybe<int> JobBuildAndInferCtx_MirroredBlobGetNumSubLbi(const std::string& job_name,
                                                               const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return ctx->MirroredBlobGetNumSubLbi(lbn);
}
// Returns sub-LBI `index` of mirrored blob `lbn` serialized as text proto.
inline Maybe<std::string> JobBuildAndInferCtx_MirroredBlobGetSubLbi(const std::string& job_name,
                                                                    const std::string& lbn,
                                                                    int index) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return PbMessage2TxtString(*JUST(ctx->MirroredBlobGetSubLbi(lbn, index)));
}
// Validates that `lbn` is well-formed and exists in job `job_name`.
inline Maybe<void> JobBuildAndInferCtx_CheckLbnValidAndExist(const std::string& job_name,
                                                             const std::string& lbn) {
  auto* ctx = JUST(GetJobBuildAndInferCtx(job_name));
  JUST(ctx->CheckLbnValidAndExist(lbn));
  return Maybe<void>::Ok();
}
// Returns the logical blob name bound to `bn_in_op` of operator `op_name` in
// job `job_name`.
// Fix: `bn_in_op` was taken as a by-value `const std::string`, forcing a copy
// on every call and diverging from every sibling wrapper in this header; it is
// now a const reference (source-compatible with all callers).
inline Maybe<std::string> JobBuildAndInferCtx_GetOpBlobLbn(const std::string& job_name,
                                                           const std::string& op_name,
                                                           const std::string& bn_in_op) {
  const auto* job_ctx = JUST(GetJobBuildAndInferCtx(job_name));
  return job_ctx->GetOpBlobLbn(op_name, bn_in_op);
}
inline Maybe<void> AddTensorAsGraphLoss(const std::shared_ptr<one::Tensor>& t) { inline Maybe<void> AddTensorAsGraphLoss(const std::shared_ptr<one::Tensor>& t) {
CHECK_OR_RETURN(t->is_lazy()); CHECK_OR_RETURN(t->is_lazy());
...@@ -180,6 +65,11 @@ inline Maybe<void> AddTensorAsGraphLoss(const std::shared_ptr<one::Tensor>& t) { ...@@ -180,6 +65,11 @@ inline Maybe<void> AddTensorAsGraphLoss(const std::shared_ptr<one::Tensor>& t) {
return JUST(GetCurInferCtx())->AddLossLogicalBlobName(loss_lbn); return JUST(GetCurInferCtx())->AddLossLogicalBlobName(loss_lbn);
} }
Maybe<void> MarkVariableGradients(const one::TensorTuple& variables,
const one::TensorTuple& gradients);
Maybe<void> MarkOutputGradients(const one::TensorTuple& outputs, const one::TensorTuple& gradients);
} // namespace oneflow } // namespace oneflow
#endif // ONEFLOW_API_PYTHON_JOB_BUILD_JOB_BUILD_AND_INFER_H_ #endif // ONEFLOW_API_PYTHON_JOB_BUILD_JOB_BUILD_AND_INFER_H_
...@@ -20,8 +20,10 @@ limitations under the License. ...@@ -20,8 +20,10 @@ limitations under the License.
#include <vector> #include <vector>
#include <functional> #include <functional>
#include "oneflow/api/python/caster/maybe.h" #include "oneflow/api/python/caster/maybe.h"
#include "oneflow/api/python/caster/tensor.h"
#include "oneflow/api/python/caster/optional.h" #include "oneflow/api/python/caster/optional.h"
#include "oneflow/api/python/caster/size.h"
#include "oneflow/api/python/caster/tensor.h"
#include "oneflow/api/python/caster/autograd_function_state.h"
#include "oneflow/core/common/preprocessor.h" #include "oneflow/core/common/preprocessor.h"
namespace oneflow { namespace oneflow {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/common/data_type_seq.h"
#include "oneflow/api/python/ofblob/ofblob.h"
#include "oneflow/api/python/ofblob/ofblob.e.h"
namespace py = pybind11;
// Exposes the raw OfBlob helpers to Python: metadata queries, shape copies,
// and per-dtype buffer copy entry points generated from POD_DATA_TYPE_SEQ.
ONEFLOW_API_PYBIND11_MODULE("", m) {
  // NOTE(review): the exported name keeps the historical "Ofblob"
  // capitalization (matching the C wrapper below); renaming it would break
  // existing Python callers.
  m.def("Ofblob_GetDataType", &Ofblob_GetDataType);
  m.def("OfBlob_NumAxes", &OfBlob_NumAxes);
  m.def("OfBlob_IsDynamic", &OfBlob_IsDynamic);
  m.def("OfBlob_CopyShapeTo", &OfBlob_CopyShapeTo);
  m.def("OfBlob_CopyStaticShapeTo", &OfBlob_CopyStaticShapeTo);
  m.def("OfBlob_CopyShapeFrom", &OfBlob_CopyShapeFrom);
  m.def("Dtype_GetOfBlobCopyToBufferFuncName", &Dtype_GetOfBlobCopyToBufferFuncName);
  m.def("Dtype_GetOfBlobCopyFromBufferFuncName", &Dtype_GetOfBlobCopyFromBufferFuncName);

// For each POD data type T, exports "OfBlob_CopyToBuffer_T" and
// "OfBlob_CopyFromBuffer_T", wrapping the numpy array in a NumPyArrayPtr
// before delegating to the C-style copier.
#define EXPORT_COPY_DATA_API(T, type_proto)                 \
  m.def("OfBlob_CopyToBuffer_" OF_PP_STRINGIZE(T),          \
        [](uint64_t of_blob_ptr, py::array_t<T> array) {    \
          oneflow::NumPyArrayPtr array_ptr(array.ptr());    \
          OfBlob_CopyToBuffer_##T(of_blob_ptr, array_ptr);  \
        });                                                 \
  m.def("OfBlob_CopyFromBuffer_" OF_PP_STRINGIZE(T),        \
        [](uint64_t of_blob_ptr, py::array_t<T> array) {    \
          oneflow::NumPyArrayPtr array_ptr(array.ptr());    \
          OfBlob_CopyFromBuffer_##T(of_blob_ptr, array_ptr);\
        });

  OF_PP_FOR_EACH_TUPLE(EXPORT_COPY_DATA_API, POD_DATA_TYPE_SEQ);
#undef EXPORT_COPY_DATA_API
}
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_OFBLOB_OFBLOB_E_H_
#define ONEFLOW_API_PYTHON_OFBLOB_OFBLOB_E_H_
#include "oneflow/core/common/foreign_lock_helper.h"
#include "oneflow/core/common/type_traits.h"
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "oneflow/core/register/ofblob.h"
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/common/data_type_seq.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/api/common/ofblob.h"
#include "oneflow/extension/python/numpy.h"
namespace py = pybind11;
namespace oneflow {
// Adapter between NumPyArrayPtr and the generic BlobBufferCopyUtil<T>:
// reinterprets the numpy buffer as T* and forwards pointer + element count.
template<typename T>
struct BlobNumpyCopyUtil {
  // Copies numpy array contents INTO the blob at `of_blob_ptr`.
  static Maybe<void> From(uint64_t of_blob_ptr, const NumPyArrayPtr& array) {
    return BlobBufferCopyUtil<T>::From(of_blob_ptr, (T*)array.data(), array.size());
  }
  // Copies the blob at `of_blob_ptr` OUT into the numpy array.
  static Maybe<void> To(uint64_t of_blob_ptr, const NumPyArrayPtr& array) {
    return BlobBufferCopyUtil<T>::To(of_blob_ptr, (T*)array.data(), array.size());
  }
};
} // namespace oneflow
// Generates, for each POD type T, the pair of C-style entry points
// OfBlob_CopyToBuffer_T / OfBlob_CopyFromBuffer_T. Each delegates to
// BlobNumpyCopyUtil<T> and turns a failed Maybe into a thrown exception
// via GetOrThrow().
#define DEFINE_COPIER(T, type_proto)                                                               \
  inline void OfBlob_CopyToBuffer_##T(uint64_t of_blob_ptr, const oneflow::NumPyArrayPtr& array) { \
    oneflow::BlobNumpyCopyUtil<T>::To(of_blob_ptr, array).GetOrThrow();                            \
  }                                                                                                \
  inline void OfBlob_CopyFromBuffer_##T(uint64_t of_blob_ptr,                                      \
                                        const oneflow::NumPyArrayPtr& array) {                     \
    oneflow::BlobNumpyCopyUtil<T>::From(of_blob_ptr, array).GetOrThrow();                          \
  }

OF_PP_FOR_EACH_TUPLE(DEFINE_COPIER, POD_DATA_TYPE_SEQ);
#undef DEFINE_COPIER
// Maps a DataType proto enum value to the name of its generated
// "OfBlob_CopyToBuffer_<type>" function, so Python can look the binding up
// by string.
inline std::string Dtype_GetOfBlobCopyToBufferFuncName(int64_t dtype) {
  using namespace oneflow;
  // Static table built once from POD_DATA_TYPE_SEQ (proto enum -> func name).
  static const HashMap<int64_t, std::string> data_type2func_name{
#define DATA_TYPE_FUNC_NAME_PAIR(type_cpp, type_proto) \
  {type_proto, "OfBlob_CopyToBuffer_" #type_cpp},
      OF_PP_FOR_EACH_TUPLE(DATA_TYPE_FUNC_NAME_PAIR, POD_DATA_TYPE_SEQ)
#undef DATA_TYPE_FUNC_NAME_PAIR
  };
  // NOTE(review): .at() throws for a dtype outside POD_DATA_TYPE_SEQ — callers
  // are expected to pass only POD dtypes.
  return data_type2func_name.at(dtype);
}
// Maps a DataType proto enum value to the name of its generated
// "OfBlob_CopyFromBuffer_<type>" function (counterpart of the To variant).
inline std::string Dtype_GetOfBlobCopyFromBufferFuncName(int64_t dtype) {
  using namespace oneflow;
  // Static table built once from POD_DATA_TYPE_SEQ (proto enum -> func name).
  static const HashMap<int64_t, std::string> data_type2func_name{
#define DATA_TYPE_FUNC_NAME_PAIR(type_cpp, type_proto) \
  {type_proto, "OfBlob_CopyFromBuffer_" #type_cpp},
      OF_PP_FOR_EACH_TUPLE(DATA_TYPE_FUNC_NAME_PAIR, POD_DATA_TYPE_SEQ)
#undef DATA_TYPE_FUNC_NAME_PAIR
  };
  // NOTE(review): .at() throws for a dtype outside POD_DATA_TYPE_SEQ — callers
  // are expected to pass only POD dtypes.
  return data_type2func_name.at(dtype);
}
#endif // ONEFLOW_API_PYTHON_OFBLOB_OFBLOB_E_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_OFBLOB_OFBLOB_H_
#define ONEFLOW_API_PYTHON_OFBLOB_OFBLOB_H_
#include "oneflow/core/common/type_traits.h"
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "oneflow/core/register/ofblob.h"
namespace py = pybind11;
inline int Ofblob_GetDataType(uint64_t of_blob_ptr) {
using namespace oneflow;
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->data_type();
}
inline size_t OfBlob_NumAxes(uint64_t of_blob_ptr) {
using namespace oneflow;
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->NumAxes();
}
inline bool OfBlob_IsDynamic(uint64_t of_blob_ptr) {
using namespace oneflow;
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->is_dynamic();
}
inline void OfBlob_CopyShapeFrom(uint64_t of_blob_ptr, py::array_t<int64_t> array) {
py::buffer_info buf = array.request();
int64_t* buf_ptr = (int64_t*)buf.ptr;
size_t size = buf.size;
using namespace oneflow;
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->CopyShapeFrom(buf_ptr, size);
}
inline void OfBlob_CopyShapeTo(uint64_t of_blob_ptr, py::array_t<int64_t> array) {
py::buffer_info buf = array.request();
int64_t* buf_ptr = (int64_t*)buf.ptr;
size_t size = buf.size;
using namespace oneflow;
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->CopyShapeTo(buf_ptr, size);
}
inline void OfBlob_CopyStaticShapeTo(uint64_t of_blob_ptr, py::array_t<int64_t> array) {
py::buffer_info buf = array.request();
int64_t* buf_ptr = (int64_t*)buf.ptr;
size_t size = buf.size;
using namespace oneflow;
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
return of_blob->CopyStaticShapeTo(buf_ptr, size);
}
#endif // ONEFLOW_API_PYTHON_OFBLOB_OFBLOB_H_
...@@ -35,12 +35,16 @@ Maybe<py::bytes> CpuBroadcast(py::bytes* in, int64_t root) { ...@@ -35,12 +35,16 @@ Maybe<py::bytes> CpuBroadcast(py::bytes* in, int64_t root) {
CHECK_NOTNULL_OR_RETURN(in); CHECK_NOTNULL_OR_RETURN(in);
PyBytes_AsStringAndSize(in->ptr(), &buffer, &length); PyBytes_AsStringAndSize(in->ptr(), &buffer, &length);
} }
JUST(ccl::Broadcast<DeviceType::kCPU>(&length, &length, sizeof(length), DataType::kChar, root, const auto& meta_transport_token =
parallel_desc, nullptr)); JUST(TransportToken::NewTransportToken(kTransportTokenTypeMeta));
JUST(ccl::CpuBroadcast(&length, &length, sizeof(length), root, parallel_desc,
meta_transport_token));
const auto& data_transport_token =
JUST(TransportToken::NewTransportToken(kTransportTokenTypeData));
if (GlobalProcessCtx::Rank() == root) { if (GlobalProcessCtx::Rank() == root) {
JUST(ccl::Broadcast<DeviceType::kCPU>(buffer, buffer, length, DataType::kChar, root, // NOLINT JUST(ccl::CpuBroadcast(buffer, buffer, length, root, parallel_desc, // NOLINT
parallel_desc, nullptr)); data_transport_token)); // NOLINT
return *in; return *in;
} else { } else {
// https://github.com/pybind/pybind11/issues/1236#issuecomment-527730864 // https://github.com/pybind/pybind11/issues/1236#issuecomment-527730864
...@@ -51,8 +55,7 @@ Maybe<py::bytes> CpuBroadcast(py::bytes* in, int64_t root) { ...@@ -51,8 +55,7 @@ Maybe<py::bytes> CpuBroadcast(py::bytes* in, int64_t root) {
bytesObject->ob_shash = -1; bytesObject->ob_shash = -1;
bytesObject->ob_sval[length] = '\0'; bytesObject->ob_sval[length] = '\0';
buffer = bytesObject->ob_sval; buffer = bytesObject->ob_sval;
JUST(ccl::Broadcast<DeviceType::kCPU>(nullptr, buffer, length, DataType::kChar, root, JUST(ccl::CpuBroadcast(nullptr, buffer, length, root, parallel_desc, data_transport_token));
parallel_desc, nullptr));
return py::reinterpret_steal<py::bytes>(reinterpret_cast<PyObject*>(bytesObject)); return py::reinterpret_steal<py::bytes>(reinterpret_cast<PyObject*>(bytesObject));
} }
} }
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/functional.h>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/thread/thread_consistent_id.h"
#include "oneflow/core/framework/rank_group_rpc_util.h"
#include "oneflow/core/job/rank_group.h"
#include "oneflow/core/job/rank_group_scope.h"
#include "oneflow/core/common/symbol.h"
namespace py = pybind11;
namespace oneflow {
namespace {
// Assigns this thread its unique consistent id and, once per thread, installs
// the initial RankGroupScope for `rank_group`.
Maybe<void> InitConsistentTransportTokenScope(const std::string& thread_tag,
                                              int64_t thread_consistent_id,
                                              Symbol<RankGroup> rank_group) {
  JUST(InitThisThreadUniqueConsistentId(thread_consistent_id, thread_tag));
  // static thread_local: the initial rank group scope is created only the
  // first time this runs on a given thread, then kept alive for its lifetime.
  static thread_local const auto& init_rank_group_scope =
      JUST(RankGroupScope::MakeInitialRankGroupScope(rank_group));
  // no unused warning for `init_rank_group_scope`.
  (void)(init_rank_group_scope);
  return Maybe<void>::Ok();
}
// Convenience overload: uses the process-wide default rank group.
Maybe<void> InitConsistentTransportTokenScope(const std::string& thread_tag,
                                              int64_t thread_consistent_id) {
  const auto& rank_group = JUST(RankGroup::DefaultRankGroup());
  JUST(InitConsistentTransportTokenScope(thread_tag, thread_consistent_id, rank_group));
  return Maybe<void>::Ok();
}
// Python-facing entry: initializes the main thread's consistent-id scope.
Maybe<void> ApiInitDefaultConsistentTransportTokenScope() {
  return InitConsistentTransportTokenScope("main", kThreadConsistentIdMain);
}
} // namespace
// Exposes the scope initializer to Python.
ONEFLOW_API_PYBIND11_MODULE("", m) {
  m.def("InitDefaultConsistentTransportTokenScope", &ApiInitDefaultConsistentTransportTokenScope);
}
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/functional.h>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/thread/thread_global_id.h"
#include "oneflow/core/framework/rank_group_rpc_util.h"
#include "oneflow/core/job/rank_group.h"
#include "oneflow/core/job/rank_group_scope.h"
#include "oneflow/core/common/symbol.h"
namespace py = pybind11;
namespace oneflow {
namespace {
// Assigns this thread its unique global id and, once per thread, installs the
// initial RankGroupScope for `rank_group`.
Maybe<void> InitGlobalTransportTokenScope(const std::string& thread_tag, int64_t thread_global_id,
                                          Symbol<RankGroup> rank_group) {
  JUST(InitThisThreadUniqueGlobalId(thread_global_id, thread_tag));
  // static thread_local: the initial rank group scope is created only the
  // first time this runs on a given thread, then kept alive for its lifetime.
  static thread_local const auto& init_rank_group_scope =
      JUST(RankGroupScope::MakeInitialRankGroupScope(rank_group));
  // no unused warning for `init_rank_group_scope`.
  (void)(init_rank_group_scope);
  return Maybe<void>::Ok();
}
// Convenience overload: uses the process-wide default rank group.
Maybe<void> InitGlobalTransportTokenScope(const std::string& thread_tag, int64_t thread_global_id) {
  const auto& rank_group = JUST(RankGroup::DefaultRankGroup());
  JUST(InitGlobalTransportTokenScope(thread_tag, thread_global_id, rank_group));
  return Maybe<void>::Ok();
}
// Python-facing entry: initializes the main thread's global-id scope.
Maybe<void> ApiInitDefaultGlobalTransportTokenScope() {
  return InitGlobalTransportTokenScope("main", kThreadGlobalIdMain);
}
} // namespace
// Exposes the scope initializer to Python.
ONEFLOW_API_PYBIND11_MODULE("", m) {
  m.def("InitDefaultGlobalTransportTokenScope", &ApiInitDefaultGlobalTransportTokenScope);
}
} // namespace oneflow
...@@ -20,21 +20,12 @@ limitations under the License. ...@@ -20,21 +20,12 @@ limitations under the License.
#include "oneflow/core/job/session.h" #include "oneflow/core/job/session.h"
#include "oneflow/core/job/env_global_objects_scope.h" #include "oneflow/core/job/env_global_objects_scope.h"
#include "oneflow/core/framework/multi_client_session_context.h" #include "oneflow/core/framework/multi_client_session_context.h"
#include "oneflow/api/python/session/session.h"
namespace py = pybind11; namespace py = pybind11;
namespace oneflow { namespace oneflow {
ONEFLOW_API_PYBIND11_MODULE("", m) { ONEFLOW_API_PYBIND11_MODULE("", m) {
m.def("IsSessionInited", &IsSessionInited);
m.def("InitLazyGlobalSession", &InitLazyGlobalSession);
m.def("InitEagerGlobalSession", &InitEagerGlobalSession);
m.def("DestroyLazyGlobalSession", &DestroyLazyGlobalSession);
m.def("StartLazyGlobalSession", &StartLazyGlobalSession);
m.def("StopLazyGlobalSession", &StopLazyGlobalSession);
using namespace oneflow; using namespace oneflow;
py::class_<MultiClientSessionContext, std::shared_ptr<MultiClientSessionContext>>( py::class_<MultiClientSessionContext, std::shared_ptr<MultiClientSessionContext>>(
m, "SessionContext") m, "SessionContext")
...@@ -43,6 +34,8 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { ...@@ -43,6 +34,8 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
[](MultiClientSessionContext& session, const std::string& config_proto_str) { [](MultiClientSessionContext& session, const std::string& config_proto_str) {
return session.TryInit(config_proto_str).GetOrThrow(); return session.TryInit(config_proto_str).GetOrThrow();
}) })
.def("try_close",
[](MultiClientSessionContext& session) { return session.TryClose().GetOrThrow(); })
.def("update_resource", .def("update_resource",
[](MultiClientSessionContext& session, const std::string& reso_proto_str) { [](MultiClientSessionContext& session, const std::string& reso_proto_str) {
return session.UpdateResource(reso_proto_str).GetOrThrow(); return session.UpdateResource(reso_proto_str).GetOrThrow();
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_SESSION_SESSION_H_
#define ONEFLOW_API_PYTHON_SESSION_SESSION_H_
#include <string>
#include <google/protobuf/text_format.h>
#include "oneflow/core/common/protobuf.h"
#include "oneflow/core/control/ctrl_client.h"
#include "oneflow/core/control/global_process_ctx.h"
#include "oneflow/core/job/global_for.h"
#include "oneflow/core/job/env_global_objects_scope.h"
#include "oneflow/core/job/session_global_objects_scope.h"
#include "oneflow/core/job/cluster_instruction.h"
#include "oneflow/core/job/oneflow.h"
#include "oneflow/core/job/job_build_and_infer_ctx_mgr.h"
#include "oneflow/core/job/resource_desc.h"
#include "oneflow/core/framework/config_def.h"
#include "oneflow/core/framework/multi_client_session_context.h"
#include "oneflow/core/framework/nn_graph.h"
#include "oneflow/core/persistence/tee_persistent_log_stream.h"
namespace oneflow {
// Reports whether the process-global session scope has been created.
inline Maybe<bool> IsSessionInited() {
  const auto* session_scope = Singleton<SessionGlobalObjectsScope>::Get();
  return session_scope != nullptr;
}
// Defaults the cpu device count to the hardware concurrency when the config
// leaves it unset (<= 0); an explicitly configured positive value is kept.
inline void FixCpuDeviceNum(ConfigProto* config_proto) {
  if (config_proto->resource().cpu_device_num() <= 0) {
    config_proto->mutable_resource()->set_cpu_device_num(std::thread::hardware_concurrency());
  }
}
// Creates the process-global session scope in eager mode from a serialized
// ConfigProto. Requires the environment (EnvDesc) to already exist and no
// session scope to be present yet. The statements below are order-dependent:
// the config is published via the ctrl service before the scope is allocated.
inline Maybe<void> InitEagerGlobalSession(const std::string& config_proto_str) {
  CHECK_NOTNULL_OR_RETURN(Singleton<EnvDesc>::Get()) << "env not found";
  ConfigProto config_proto;
  CHECK_OR_RETURN(TxtString2PbMessage(config_proto_str, &config_proto))
      << "failed to parse config_proto: " << config_proto_str;
  // Fill in a default cpu device count if the caller left it unset.
  FixCpuDeviceNum(&config_proto);
  // Publish the effective config so other processes can fetch it.
  Singleton<CtrlClient>::Get()->PushKV("config_proto", config_proto);
  // Double initialization is a programming error.
  CHECK_ISNULL_OR_RETURN(Singleton<SessionGlobalObjectsScope>::Get());
  Singleton<SessionGlobalObjectsScope>::SetAllocated(new SessionGlobalObjectsScope());
  JUST(Singleton<SessionGlobalObjectsScope>::Get()->EagerInit(config_proto));
  VLOG(3) << "NewGlobal " << typeid(SessionGlobalObjectsScope).name();
  return Maybe<void>::Ok();
}
// Creates the process-global session scope in lazy mode from a serialized
// ConfigProto. Master-only: the master broadcasts the session-start
// instruction to the cluster before parsing and publishing the config.
// Statement order matters here — do not reorder.
inline Maybe<void> InitLazyGlobalSession(const std::string& config_proto_str) {
  CHECK_NOTNULL_OR_RETURN(Singleton<EnvDesc>::Get()) << "env not found";
  CHECK_OR_RETURN(GlobalProcessCtx::IsThisProcessMaster());
  // Tell the rest of the cluster that a session is starting.
  ClusterInstruction::MasterSendSessionStart();
  ConfigProto config_proto;
  CHECK_OR_RETURN(TxtString2PbMessage(config_proto_str, &config_proto))
      << "failed to parse config_proto: " << config_proto_str;
  // Fill in a default cpu device count if the caller left it unset.
  FixCpuDeviceNum(&config_proto);
  // Publish the effective config so worker processes can fetch it.
  Singleton<CtrlClient>::Get()->PushKV("config_proto", config_proto);
  // Double initialization is a programming error.
  CHECK_ISNULL_OR_RETURN(Singleton<SessionGlobalObjectsScope>::Get());
  Singleton<SessionGlobalObjectsScope>::SetAllocated(new SessionGlobalObjectsScope());
  JUST(Singleton<SessionGlobalObjectsScope>::Get()->Init(config_proto));
  VLOG(3) << "NewGlobal " << typeid(SessionGlobalObjectsScope).name();
  return Maybe<void>::Ok();
}
// Destroys the global session scope if one exists. A missing scope is not an
// error (idempotent); when a scope does exist, only the master may destroy it.
inline Maybe<void> DestroyLazyGlobalSession() {
  auto* session_scope = Singleton<SessionGlobalObjectsScope>::Get();
  if (session_scope != nullptr) {
    CHECK_OR_RETURN(GlobalProcessCtx::IsThisProcessMaster());
    Singleton<SessionGlobalObjectsScope>::Delete();
  }
  return Maybe<void>::Ok();
}
// Builds and launches the lazy-mode runtime from the jobs compiled so far.
// Master-only; requires InitLazyGlobalSession to have run first. The bring-up
// order below (PushKV, strategy singleton, Oneflow singleton, Init) is
// significant — do not reorder.
inline Maybe<void> StartLazyGlobalSession() {
  CHECK_NOTNULL_OR_RETURN(Singleton<SessionGlobalObjectsScope>::Get()) << "session not found";
  CHECK_OR_RETURN(GlobalProcessCtx::IsThisProcessMaster());
  const JobSet& job_set = Singleton<LazyJobBuildAndInferCtxMgr>::Get()->job_set();
  // In debug mode, dump the full job set for offline inspection.
  if (Singleton<ResourceDesc, ForSession>::Get()->enable_debug_mode()) {
    TeePersistentLogStream::Create("job_set.prototxt")->Write(job_set);
  }
  // Starting a session with no compiled jobs is a user error.
  if (job_set.job().empty()) { return Error::JobSetEmptyError() << "no function defined"; }
  // The runtime must not already be running.
  CHECK_ISNULL_OR_RETURN(Singleton<Oneflow>::Get());
  // Publish the job set so worker processes can fetch it.
  Singleton<CtrlClient>::Get()->PushKV("session_job_set", job_set);
  Singleton<const InterJobReuseMemStrategy>::New(job_set.inter_job_reuse_mem_strategy());
  Singleton<Oneflow>::New();
  JUST(Singleton<Oneflow>::Get()->Init(job_set));
  return Maybe<void>::Ok();
}
// Tears down the lazy-mode runtime started by StartLazyGlobalSession.
// Idempotent: a runtime that was never started is a no-op; otherwise only
// the master process may stop it.
inline Maybe<void> StopLazyGlobalSession() {
  // Nothing to stop if the runtime was never created.
  if (Singleton<Oneflow>::Get() == nullptr) { return Maybe<void>::Ok(); }
  CHECK_OR_RETURN(GlobalProcessCtx::IsThisProcessMaster());
  // The previous CHECK_NOTNULL_OR_RETURN on Singleton<Oneflow>::Get() was
  // redundant: the early return above already guarantees it is non-null.
  Singleton<Oneflow>::Delete();
  Singleton<const InterJobReuseMemStrategy>::Delete();
  return Maybe<void>::Ok();
}
} // namespace oneflow
#endif // ONEFLOW_API_PYTHON_SESSION_SESSION_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <utility>
#include "pybind11/pybind11.h"
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/common/singleton.h"
#include "oneflow/extension/stack/foreign_stack_getter.h"
#include "oneflow/extension/stack/python/stack_getter.h"
namespace py = pybind11;
namespace oneflow {
ONEFLOW_API_PYBIND11_MODULE("", m) {
  // Install the Python stack getter so C++ code can capture Python frames.
  m.def("RegisterStackGetter", &RegisterPyStackGetter);
  // Return the current (formatted) stack as seen by the registered getter.
  m.def("GetCurrentStack", []() {
    // NOTE(review): assumes a stack getter has been registered; if Get()
    // returns nullptr this dereferences null — confirm RegisterStackGetter
    // is always called before GetCurrentStack.
    auto* stack_getter = Singleton<ForeignStackGetter>::Get();
    return stack_getter->GetFormattedStack(stack_getter->GetCurrentFrame());
  });
}
} // namespace oneflow
...@@ -257,8 +257,8 @@ ONEFLOW_API_PYBIND11_MODULE("", m) { ...@@ -257,8 +257,8 @@ ONEFLOW_API_PYBIND11_MODULE("", m) {
.def("__str__", PlacementToString) .def("__str__", PlacementToString)
.def("__repr__", PlacementToString) .def("__repr__", PlacementToString)
.def(py::self == py::self) .def(py::self == py::self)
.def(py::hash(py::self)); .def(py::hash(py::self))
m.def("AllDevicePlacement", &PlacementSymbolExportUtil::AllDevicePlacement); .def_static("all", &PlacementSymbolExportUtil::AllDevicePlacement);
} }
} // namespace oneflow } // namespace oneflow
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment