Commit a715222c authored by yuguo's avatar yuguo
Browse files

0.9.1-rocm

parent f262efc9
......@@ -28,9 +28,10 @@ inline Maybe<void> FillVariableTensorMgr(
auto mgr = Singleton<VariableTensorMgr>::Get();
return mgr->Fill(variable_op_names, variable_tensors);
}
inline void ClearVariableTensorMgr() {
inline void ResetVariableTensorMgr() {
auto mgr = Singleton<VariableTensorMgr>::Get();
mgr->Clear();
mgr->Reset();
}
inline std::tuple<std::vector<std::string>, std::vector<std::shared_ptr<one::Tensor>>>
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/cpp/embedding/embedding.h"
#include "oneflow/core/embedding/embedding_manager.h"
namespace oneflow_api {
namespace embedding {
// Registers a OneEmbedding key-value store described by the serialized-JSON
// `key_value_store_options` and returns the embedding name recorded in the
// options. Only usable in CUDA/ROCm builds; otherwise aborts via
// UNIMPLEMENTED().
std::string CreateKeyValueStore(const std::string& key_value_store_options, int64_t local_rank_id,
                                int64_t rank_id, int64_t world_size) {
  oneflow::embedding::KeyValueStoreOptions store_options(key_value_store_options);
#if defined(WITH_CUDA) || defined(WITH_ROCM)
  auto* embedding_mgr = oneflow::Singleton<oneflow::embedding::EmbeddingManager>::Get();
  embedding_mgr->CreateKeyValueStore(store_options, local_rank_id, rank_id, world_size);
  return store_options.Name();
#else
  UNIMPLEMENTED() << "OneEmbedding Only Support with CUDA";
#endif
  // Unreachable in GPU builds; satisfies the return requirement otherwise.
  return "";
}
// Loads a previously saved snapshot `snapshot_name` into the key-value store
// registered under `embedding_name` for the given rank. Only usable in
// CUDA/ROCm builds; otherwise aborts via UNIMPLEMENTED().
void LoadSnapshot(const std::string& snapshot_name, const std::string& embedding_name,
                  int64_t local_rank_id, int64_t rank_id) {
#if defined(WITH_CUDA) || defined(WITH_ROCM)
  // NOTE: EmbeddingManager::LoadSnapshot takes the embedding name first and
  // the snapshot name last — argument order differs from this wrapper's.
  oneflow::Singleton<oneflow::embedding::EmbeddingManager>::Get()->LoadSnapshot(
      embedding_name, local_rank_id, rank_id, snapshot_name);
#else
  UNIMPLEMENTED() << "OneEmbedding Only Support with CUDA";
#endif
}
} // namespace embedding
} // namespace oneflow_api
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_CPP_ONE_EMBEDDING_ONE_EMBEDDING_H_
#define ONEFLOW_API_CPP_ONE_EMBEDDING_ONE_EMBEDDING_H_
#include <string>
namespace oneflow_api {
namespace embedding {
// CreateKeyValueStore returns embedding name in the options.
std::string CreateKeyValueStore(const std::string& key_value_store_options, int64_t local_rank_id,
int64_t rank_id,
int64_t world_size); // key_value_store_options is
// a serialized json string.
void LoadSnapshot(const std::string& snapshot_name, const std::string& embedding_name,
int64_t local_rank_id, int64_t rank_id);
} // namespace embedding
} // namespace oneflow_api
#endif // ONEFLOW_API_CPP_ONE_EMBEDDING_ONE_EMBEDDING_H_
......@@ -18,7 +18,7 @@ limitations under the License.
#include "oneflow/api/cpp/env.h"
#include "oneflow/api/cpp/env_impl.h"
#include "oneflow/core/framework/shut_down_util.h"
#include "oneflow/core/thread/thread_consistent_id.h"
#include "oneflow/core/thread/thread_global_id.h"
namespace oneflow_api {
void initialize() {
......@@ -29,7 +29,7 @@ void initialize() {
void release() {
if (of::Singleton<OneFlowEnv>::Get() != nullptr) { of::Singleton<OneFlowEnv>::Delete(); }
of::SetShuttingDown();
of::ResetThisThreadUniqueConsistentId().GetOrThrow();
of::ResetThisThreadUniqueGlobalId().GetOrThrow();
}
} // namespace oneflow_api
......@@ -107,6 +107,9 @@ void CompleteEnvProto(of::EnvProto& env_proto) {
if (HasEnvVar("GLOG_logbuflevel")) {
cpp_logging_conf->set_logbuflevel(GetEnvVar("GLOG_logbuflevel", -1));
}
if (HasEnvVar("GLOG_minloglevel")) {
cpp_logging_conf->set_minloglevel(GetEnvVar("GLOG_minloglevel", -1));
}
}
} // namespace
......@@ -119,15 +122,15 @@ OneFlowEnv::OneFlowEnv() {
of::ConfigProto config_proto;
config_proto.mutable_resource()->set_cpu_device_num(1); // useless, will be set in TryInit
const int64_t session_id = of::NewSessionId();
CHECK_JUST(of::RegsiterSession(session_id));
config_proto.set_session_id(session_id);
CHECK(of::RegsterSessionId(session_id));
session_ctx_ = std::make_shared<of::MultiClientSessionContext>(env_ctx_);
CHECK_JUST(session_ctx_->TryInit(config_proto));
}
OneFlowEnv::~OneFlowEnv() {
session_ctx_.reset();
CHECK(of::ClearSessionId(CHECK_JUST(of::GetDefaultSessionId())));
env_ctx_.reset();
}
......
......@@ -13,8 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/common/ofblob.h"
#include "nlohmann/json.hpp"
#include "oneflow/api/common/variable_tensor_mgr.h"
#include "oneflow/api/cpp/env_impl.h"
#include "oneflow/api/cpp/framework/device.h"
......@@ -23,6 +22,7 @@ limitations under the License.
#include "oneflow/api/cpp/framework/ivalue.h"
#include "oneflow/api/cpp/framework/shape.h"
#include "oneflow/api/cpp/framework/tensor.h"
#include "oneflow/api/cpp/embedding/embedding.h"
#include "oneflow/api/common/job_build_and_infer_ctx.h"
#include "oneflow/api/python/job_build/job_build_and_infer.h"
#include "oneflow/core/common/data_type.pb.h"
......@@ -32,6 +32,8 @@ limitations under the License.
#include "oneflow/core/common/shape.h"
#include "oneflow/core/common/symbol.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/embedding/posix_file.h"
#include "oneflow/core/eager/eager_blob_object.h"
#include "oneflow/core/framework/device.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/framework/multi_client_session_context.h"
......@@ -52,6 +54,8 @@ limitations under the License.
#include "oneflow/core/job/parallel_desc.h"
#include "oneflow/core/job/scope.h"
#include "oneflow/core/job/session.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/memory/memory_case_util.h"
#include "oneflow/core/operator/interface_blob_conf.pb.h"
#include "oneflow/core/operator/op_conf.pb.h"
#include "oneflow/core/register/logical_blob_id.pb.h"
......@@ -108,6 +112,30 @@ Shape OfShapeToOfApiShape(const of::Shape& of_shape) {
return Shape(dims);
}
#ifdef __linux__
// Restores OneEmbedding key-value stores for a saved model when the model
// directory contains an "one_embedding_options.json" description file.
// Single-process defaults are used (rank 0, world size 1).
// NOTE(review): `device` is currently unused — kept for interface stability.
void LoadOneEmbedding(const std::string& model_path, const Device& device) {
  const std::string options_file_name("one_embedding_options.json");
  const std::string options_file_path(oneflow::JoinPath(model_path, options_file_name));
  if (!oneflow::embedding::PosixFile::FileExists(options_file_path)) { return; }
  std::ifstream options_stream(options_file_path);
  auto options_json = nlohmann::json::parse(options_stream);
  for (auto& entry : options_json["embedding"]) {
    const std::string snapshot_path = entry["snapshot"];
    auto kv_options = entry["kv_options"];
    const std::string embedding_name = embedding::CreateKeyValueStore(kv_options.dump(),
                                                                      /*local_rank_id=*/0,
                                                                      /*rank_id=*/0,
                                                                      /*world_size=*/1);
    embedding::LoadSnapshot(snapshot_path, embedding_name, /*local_rank_id=*/0,
                            /*rank_id=*/0);
  }
}
#endif  // __linux__
} // namespace
class Graph::GraphImpl final {
......@@ -202,6 +230,9 @@ IValue Graph::Forward(const IValue& inputs) {
void Graph::set_batch_size(int batch_size) { graph_->set_batch_size(batch_size); }
// Loads a saved graph from `model_path` onto `device`. On Linux this first
// restores any OneEmbedding key-value stores saved alongside the model.
Graph Graph::Load(const std::string& model_path, const Device& device) {
#ifdef __linux__
  LoadOneEmbedding(model_path, device);
#endif  // __linux__
  return Graph(model_path, device);
}
......@@ -306,7 +337,7 @@ of::Maybe<void> Graph::GraphImpl::AddOp(of::OperatorConf op_conf) {
0, batch_size_);
}
auto* ctx = JUST(of::GetCurInferCtx());
JUST(ctx->AddAndInferConsistentOp(op_conf));
JUST(ctx->AddAndInferGlobalOp(op_conf));
return of::Maybe<void>::Ok();
}
......@@ -374,11 +405,12 @@ of::Maybe<void> Graph::GraphImpl::LoadCheckpoint() {
ss << variable_file.rdbuf();
return ss.str();
}();
const auto& callback = [&](uint64_t of_blob_ptr) {
CHECK_JUST(of::BlobBufferCopyUtil<void>::From(
of_blob_ptr, buffer.data(),
variable_tensor->shape()->elem_cnt()
* of::GetSizeOfDataType(variable_tensor->dtype()->data_type())));
const auto& callback = [&](of::ep::Stream* stream,
const std::shared_ptr<of::vm::EagerBlobObject>& eager_blob_object) {
of::AutoMemcpy(stream, eager_blob_object->mut_dptr(), buffer.data(),
variable_tensor->shape()->elem_cnt()
* of::GetSizeOfDataType(variable_tensor->dtype()->data_type()),
eager_blob_object->mem_case(), of::memory::MakeHostMemCase());
};
JUST(of::one::SyncAccessTensorWithTimeOut(variable_tensor, callback, "mut"));
}
......
......@@ -21,9 +21,8 @@ limitations under the License.
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/job/lazy_mode.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/framework/instructions_builder.h"
#include "oneflow/core/register/ofblob.h"
#include "oneflow/api/common/ofblob.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/vm/virtual_machine.h"
......@@ -68,14 +67,14 @@ Device Tensor::device() const {
DType Tensor::dtype() const { return static_cast<DType>(tensor_->dtype()->data_type()); }
void Tensor::zeros_() {
std::shared_ptr<of::one::MirroredTensor> local_tensor =
tensor_->AsMirroredTensor().GetPtrOrThrow();
std::shared_ptr<of::one::LocalTensor> local_tensor = tensor_->AsLocalTensor().GetPtrOrThrow();
of::PhysicalRun([&](of::InstructionsBuilder* builder) -> of::Maybe<void> {
JUST(builder->AccessBlobByCallback(
local_tensor,
[](uint64_t of_blob_ptr) {
auto* of_blob = reinterpret_cast<of::OfBlob*>(of_blob_ptr);
of_blob->AsyncAutoMemset(0);
[](of::ep::Stream* stream,
const std::shared_ptr<of::vm::EagerBlobObject>& eager_blob_object) {
of::AutoMemset(stream, eager_blob_object->mut_dptr(), 0,
eager_blob_object->ByteSizeOfBlobBody(), eager_blob_object->mem_case());
},
"mut"));
return of::Maybe<void>::Ok();
......@@ -85,14 +84,16 @@ void Tensor::zeros_() {
Tensor Tensor::from_buffer(const void* buffer, const Shape& shape, const Device& device,
const DType& dtype) {
Tensor tensor(shape, device, dtype);
std::shared_ptr<of::one::MirroredTensor> local_tensor =
tensor.tensor_->AsMirroredTensor().GetPtrOrThrow();
std::shared_ptr<of::one::LocalTensor> local_tensor =
tensor.tensor_->AsLocalTensor().GetPtrOrThrow();
of::PhysicalRun([&](of::InstructionsBuilder* builder) -> of::Maybe<void> {
return builder->AccessBlobByCallback(
local_tensor,
[buffer, shape, dtype](uint64_t ofblob_ptr) {
CHECK_JUST(of::BlobBufferCopyUtil<void>::From(ofblob_ptr, buffer,
shape.Count(0) * GetDTypeSize(dtype)));
[buffer, shape, dtype](of::ep::Stream* stream,
const std::shared_ptr<of::vm::EagerBlobObject>& eager_blob_object) {
of::AutoMemcpy(stream, eager_blob_object->mut_dptr(), buffer,
shape.Count(0) * GetDTypeSize(dtype), eager_blob_object->mem_case(),
of::memory::MakeHostMemCase());
},
"mut");
}).GetOrThrow();
......@@ -101,14 +102,16 @@ Tensor Tensor::from_buffer(const void* buffer, const Shape& shape, const Device&
template<typename T>
void Tensor::copy_to(T* buffer) const {
std::shared_ptr<of::one::MirroredTensor> local_tensor =
tensor_->AsMirroredTensor().GetPtrOrThrow();
std::shared_ptr<of::one::LocalTensor> local_tensor = tensor_->AsLocalTensor().GetPtrOrThrow();
const auto shape = this->shape();
const auto& Callback = [buffer, shape](uint64_t ofblob_ptr) {
CHECK_JUST(of::BlobBufferCopyUtil<T>::To(ofblob_ptr, buffer, shape.Count(0)));
const auto& Callback = [buffer, shape](
of::ep::Stream* stream,
const std::shared_ptr<of::vm::EagerBlobObject>& eager_blob_object) {
of::AutoMemcpy(stream, buffer, eager_blob_object->mut_dptr(), shape.Count(0) * sizeof(T),
of::memory::MakeHostMemCase(), eager_blob_object->mem_case());
};
auto btb = std::make_shared<of::BlockingThenBusy>(1);
auto btb = std::make_shared<of::BlockingThenBusy>();
CHECK_JUST(of::PhysicalRun([&](of::InstructionsBuilder* builder) -> of::Maybe<void> {
return builder->SyncAccessBlobByCallback(local_tensor, btb, Callback, "const");
}));
......
......@@ -30,13 +30,8 @@ limitations under the License.
namespace oneflow_api {
namespace {
std::mt19937 rng(std::random_device{}());
}
Shape RandomShape() {
thread_local static std::mt19937 rng(std::random_device{}());
std::uniform_int_distribution<> dist_ndim(1, 4), dist_dims(16, 64);
std::vector<std::int64_t> dims(dist_ndim(rng), 0);
for (auto& x : dims) { x = dist_dims(rng); }
......@@ -45,6 +40,7 @@ Shape RandomShape() {
template<typename T>
std::vector<T> RandomData(size_t size) {
thread_local static std::mt19937 rng(std::random_device{}());
std::uniform_int_distribution<> dist(-100, 100);
std::vector<T> data(size);
for (auto& x : data) { x = static_cast<T>(dist(rng)); }
......
......@@ -64,27 +64,7 @@ TEST(Api, graph_cpu_test) {
Forward(graph, device, 1);
}
#ifdef WITH_CUDA
TEST(Api, graph_gpu_test) {
EnvScope scope;
Device device("cuda", 0);
Graph graph = LoadGraph(device);
Forward(graph, device);
}
TEST(Api, graph_multi_gpu_test) {
EnvScope scope;
Device device("cuda", 0);
Graph graph = LoadGraph(device);
Forward(graph, device);
Device device1("cuda", 1);
Graph graph1 = LoadGraph(device1);
Forward(graph1, device1);
}
#endif
#ifdef WITH_ROCM
#if defined(WITH_CUDA) || defined(WITH_ROCM)
TEST(Api, graph_gpu_test) {
EnvScope scope;
Device device("cuda", 0);
......@@ -112,7 +92,7 @@ TEST(Api, graph_cpu_batching_test) {
Forward(graph, device, 10);
}
#ifdef WITH_CUDA
#if defined(WITH_CUDA) || defined(WITH_ROCM)
TEST(Api, graph_gpu_batching_test) {
EnvScope scope;
Device device("cuda", 0);
......
module {
oneflow.job @MyGraph_0(%arg0: tensor<1x3xf32>) -> tensor<1x4xf32> {
%output = "oneflow.input"(%arg0) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_0-input_0", output_lbns = ["_MyGraph_0-input_0/out"], scope_symbol_id = 4611686018427469823 : i64, shape = [1 : si64, 3 : si64]} : (tensor<1x3xf32>) -> tensor<1x3xf32>
%output_0 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], nd_sbp = ["B"], op_name = "model.a", output_lbns = ["model.a/out"], scope_symbol_id = 4611686018427482111 : i64, shape = [3 : si64, 4 : si64]} : () -> tensor<3x4xf32>
%output_1 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], nd_sbp = ["B"], op_name = "model.b", output_lbns = ["model.b/out"], scope_symbol_id = 4611686018427494399 : i64, shape = [4 : si64]} : () -> tensor<4xf32>
%output_0 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], parallel = #sbp.parallel<[] -> [#sbp.B]>, op_name = "model.a", output_lbns = ["model.a/out"], scope_symbol_id = 4611686018427482111 : i64, shape = [3 : si64, 4 : si64]} : () -> tensor<3x4xf32>
%output_1 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], parallel = #sbp.parallel<[] -> [#sbp.B]>, op_name = "model.b", output_lbns = ["model.b/out"], scope_symbol_id = 4611686018427494399 : i64, shape = [4 : si64]} : () -> tensor<4xf32>
%0 = "oneflow.matmul"(%output, %output_0) {alpha = 1.000000e+00 : f64, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-matmul_0", output_lbns = ["model-matmul_0/out_0"], scope_symbol_id = 4611686018427486207 : i64, transpose_a = false, transpose_b = false} : (tensor<1x3xf32>, tensor<3x4xf32>) -> tensor<1x4xf32>
%1 = "oneflow.broadcast_add"(%0, %output_1) {device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-broadcast_add_1", output_lbns = ["model-broadcast_add_1/z_0"], scope_symbol_id = 4611686018427486207 : i64} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
%output_2 = "oneflow.output"(%1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_0-output_0", output_lbns = ["_MyGraph_0-output_0/out"], scope_symbol_id = 4611686018427469823 : i64, shape = [1 : si64, 4 : si64]} : (tensor<1x4xf32>) -> tensor<1x4xf32>
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <gtest/gtest.h>
#include "oneflow/api/cpp/tests/api_test.h"
namespace oneflow_api {
#if defined(WITH_CUDA) || defined(WITH_ROCM)
TEST(Api, embedding_test) {
  EnvScope scope;
  Device device("cuda");
  Graph graph = Graph::Load("/path/to/embedding", device);
  // 39 feature slots per sample, every id set to 1.
  const int64_t batch_size = 10000;
  const int64_t num_features = 39;
  std::vector<int64_t> data(batch_size * num_features, 1);
  std::vector<Tensor> inputs;
  inputs.emplace_back(
      Tensor::from_buffer(data.data(), Shape({batch_size, num_features}), device, DType::kInt64));
  const auto& value = graph.Forward(inputs);
  ASSERT_TRUE(value.IsTensor());
  Tensor output = value.ToTensor();
  // Expect one logit per sample.
  const Shape shape = output.shape();
  ASSERT_EQ(shape.At(0), batch_size);
  ASSERT_EQ(shape.At(1), 1);
  std::vector<float> result(batch_size);
  output.copy_to(result.data());
}
#endif
} // namespace oneflow_api
......@@ -25,16 +25,7 @@ TEST(Api, device) {
auto device = Device("cpu");
ASSERT_EQ(device.type(), "cpu");
#ifdef WITH_CUDA
device = Device("cuda:0");
ASSERT_EQ(device.type(), "cuda");
ASSERT_EQ(device.device_id(), 0);
device = Device("cuda", 1);
ASSERT_EQ(device.type(), "cuda");
ASSERT_EQ(device.device_id(), 1);
#endif
#ifdef WITH_ROCM
#if defined(WITH_CUDA) || defined(WITH_ROCM)
device = Device("cuda:0");
ASSERT_EQ(device.type(), "cuda");
ASSERT_EQ(device.device_id(), 0);
......
......@@ -16,9 +16,13 @@ limitations under the License.
#include <pybind11/pybind11.h>
#include <memory>
#include <utility>
#include <vector>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/api/python/job_build/job_build_and_infer.h"
#include "oneflow/core/common/throw.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/framework/scope_util.h"
#include "oneflow/core/framework/tensor.h"
#include "oneflow/core/framework/tensor_tuple.h"
#include "oneflow/core/autograd/autograd_engine.h"
......@@ -26,6 +30,7 @@ limitations under the License.
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/container_util.h"
#include "oneflow/core/framework/saved_tensor_hooks.h"
namespace oneflow {
namespace autograd {
......@@ -50,7 +55,8 @@ Maybe<one::TensorTuple> CheckAndInitOutGrads(const one::TensorTuple& outputs,
<< " gradients";
for (int i = 0; i < outputs.size(); ++i) {
CHECK_OR_RETURN(outputs.at(i)->requires_grad())
<< "All output tensors `.requires_grad` should be true";
<< "\nRuntimeError: element " << i
<< " of tensors does not require grad and does not have a grad_fn";
if (!outputs.at(i)->grad_fn_node()) {
CHECK_OR_RETURN(outputs.at(i)->is_leaf())
<< "output[" << i << "] doesn't have grad_fn and it is not leaf tensor!\n"
......@@ -66,7 +72,6 @@ Maybe<one::TensorTuple> CheckAndInitOutGrads(const one::TensorTuple& outputs,
CHECK_OR_RETURN(*(outputs.at(i)->shape()) == *(out_grads.at(i)->shape()))
<< "out_grad's shape must be same as output's (" << outputs.at(i)->shape()->ToString()
<< " vs " << out_grads.at(i)->shape()->ToString() << ")";
// if (outputs.at(i)->dtype() != out_grads.at(i)->dtype()) {
if (JUST(oneflow::VectorAt(outputs, i))->dtype()
!= JUST(oneflow::VectorAt(out_grads, i))->dtype()) {
JUST(oneflow::VectorAt(*gradients, i)) =
......@@ -76,6 +81,7 @@ Maybe<one::TensorTuple> CheckAndInitOutGrads(const one::TensorTuple& outputs,
}
}
}
if (LazyMode::is_enabled()) { JUST(MarkOutputGradients(outputs, *gradients)); }
return gradients;
}
......@@ -83,6 +89,7 @@ Maybe<one::TensorTuple> CheckAndInitOutGrads(const one::TensorTuple& outputs,
Maybe<one::TensorTuple> Backward(const one::TensorTuple& outputs, const one::TensorTuple& out_grads,
bool retain_graph, bool create_graph) {
BackwardPassScopeGuard backward_guard;
if (create_graph) { retain_graph = true; }
std::shared_ptr<one::TensorTuple> gradients = JUST(CheckAndInitOutGrads(outputs, out_grads));
JUST(one::GetThreadLocalAutogradEngine()->RunBackwardAndSaveGrads4LeafTensorIf(
......@@ -93,6 +100,7 @@ Maybe<one::TensorTuple> Backward(const one::TensorTuple& outputs, const one::Ten
Maybe<one::TensorTuple> Grad(const one::TensorTuple& outputs, const one::TensorTuple& inputs,
const one::TensorTuple& out_grads, bool retain_graph,
bool create_graph) {
BackwardPassScopeGuard backward_guard;
if (create_graph) { retain_graph = true; }
if (inputs.empty()) { return Backward(outputs, out_grads, retain_graph, create_graph); }
CHECK_OR_RETURN(std::all_of(
......@@ -104,9 +112,80 @@ Maybe<one::TensorTuple> Grad(const one::TensorTuple& outputs, const one::TensorT
outputs, inputs, *gradients, retain_graph, create_graph);
}
namespace py = pybind11;
class PySavedTensorHook final : public one::SavedTensorHook {
public:
PySavedTensorHook(const py::function& pack_hook, const py::function& unpack_hook)
: pack_hook_(pack_hook), unpack_hook_(unpack_hook) {}
void pack(const std::shared_ptr<one::Tensor>& tensor) {
py::gil_scoped_acquire acquire;
py::object packed = pack_hook_(tensor);
data_ = packed.release().ptr();
}
std::shared_ptr<one::Tensor> unpack() {
py::gil_scoped_acquire acquire;
py::object obj = py::cast<py::object>(data_);
py::object x = unpack_hook_(obj);
std::shared_ptr<one::Tensor> tensor;
try {
tensor = py::cast<std::shared_ptr<one::Tensor>>(x);
} catch (const py::cast_error& e) {
THROW(RuntimeError) << "unpack_hook should return a Tensor, but got `"
<< py::str(x.get_type()).cast<std::string>() << "` instead";
}
return tensor;
}
private:
PyObject* data_ = nullptr;
py::function pack_hook_;
py::function unpack_hook_;
};
// Factory that hands out PySavedTensorHook instances built from the most
// recently registered (pack_hook, unpack_hook) pair. Pairs are pushed/popped
// like a stack by the Python-side context manager.
class PySavedTensorHookCreator final : public one::SavedTensorHookCreator {
 public:
  // Returns a hook wrapping the newest registered pair, or nullptr when no
  // hooks are registered.
  std::unique_ptr<one::SavedTensorHook> new_saved_tensor_hook() const override {
    if (hooks_.empty()) { return nullptr; }
    const auto& newest = hooks_.back();
    return std::make_unique<PySavedTensorHook>(newest.first, newest.second);
  }

  void append_new_hooks(const py::function& pack_hook, const py::function& unpack_hook) {
    hooks_.emplace_back(pack_hook, unpack_hook);
  }

  void pop_hooks() {
    CHECK_OR_THROW(!hooks_.empty()) << "pop_hooks should not be called when there are no hooks";
    hooks_.pop_back();
  }

 private:
  // Stack of (pack_hook, unpack_hook) pairs; the newest pair wins.
  small_vector<std::pair<py::function, py::function>, 1> hooks_;
};
// Python bindings for oneflow.autograd: backward/grad entry points plus the
// `graph` submodule that manages saved-tensor hooks (mirroring
// torch.autograd.graph.saved_tensors_hooks).
ONEFLOW_API_PYBIND11_MODULE("autograd", m) {
  m.def("backward", &Backward);
  m.def("grad", &Grad);
  m.def_submodule("graph")
      // Installs the Python-aware hook creator as the process-wide singleton.
      // Must be called once before append_new_hooks/pop_hooks.
      .def("register_saved_tensors_hook_manager",
           []() {
             Singleton<one::SavedTensorHookCreator>::SetAllocated(new PySavedTensorHookCreator());
           })
      // Pushes a (pack, unpack) pair; the newest pair is used for new hooks.
      .def("append_new_hooks",
           [](const py::function& pack_hook, const py::function& unpack_hook) {
             PySavedTensorHookCreator* creator = dynamic_cast<PySavedTensorHookCreator*>(
                 Singleton<one::SavedTensorHookCreator>::Get());
             CHECK_NOTNULL_OR_THROW(creator)
                 << "`register_saved_tensors_hook_manager` should be called "
                    "before calling `append_new_hooks`";
             creator->append_new_hooks(pack_hook, unpack_hook);
           })
      // Pops the most recently pushed pair (context-manager exit path).
      .def("pop_hooks", []() {
        PySavedTensorHookCreator* creator =
            dynamic_cast<PySavedTensorHookCreator*>(Singleton<one::SavedTensorHookCreator>::Get());
        CHECK_NOTNULL_OR_THROW(creator) << "`register_saved_tensors_hook_manager` should be called "
                                           "before calling `pop_hooks`";
        creator->pop_hooks();
      });
}
} // namespace autograd
......
......@@ -38,17 +38,22 @@ Maybe<one::TensorTuple> UnpackTensorTuple(const py::object& input) {
tp.emplace_back(input.cast<std::shared_ptr<one::Tensor>>());
} else if (py::isinstance<py::tuple>(input)) {
auto tuple = input.cast<py::tuple>();
tp.resize(tuple.size());
for (int i = 0; i < tuple.size(); ++i) {
PyObject* obj = tuple[i].ptr();
if (!one::PyTensor_Check(obj)) {
if (obj == Py_None) {
// do nothing
} else if (one::PyTensor_Check(obj)) {
tp[i] = one::PyTensor_Unpack(obj);
} else {
return Error::RuntimeError()
<< "expected Tensor as element " << i << ", but got "
<< "expected Tensor or None as element " << i << ", but got "
<< one::functional::PyStringAsString(PyObject_Str((PyObject*)Py_TYPE(obj)));
}
tp.emplace_back(one::PyTensor_Unpack(obj));
}
} else {
return Error::RuntimeError() << "Only support tensor or list of tensors";
return Error::RuntimeError()
<< "autograd.Function's output only support tensor or list of tensors";
}
return tp;
}
......@@ -90,22 +95,6 @@ ONEFLOW_API_PYBIND11_MODULE("autograd", m) {
*input_tensor_tuple));
return PackTensorTuple(*res);
});
py::class_<FunctionAutoGradCaptureState, std::shared_ptr<FunctionAutoGradCaptureState>>(
m, "FunctionAutoGradCaptureState")
.def(py::init([]() { return std::make_shared<FunctionAutoGradCaptureState>(); }))
.def("save_for_backward",
[](FunctionAutoGradCaptureState& ctx, const py::args& input) {
const auto& tensors = UnpackTensorTuple(input).GetOrThrow();
for (const auto& tensor : tensors) { ctx.SaveTensorForBackward(tensor); }
})
.def_property_readonly(
"saved_tensors",
[](const FunctionAutoGradCaptureState& ctx) { return py::cast(ctx.SavedTensors()); })
.def("mark_non_differentiable", [](FunctionAutoGradCaptureState& ctx, const py::args& input) {
const auto& tensors = UnpackTensorTuple(input).GetOrThrow();
for (const auto& tensor : tensors) { ctx.MarkNonDifferentiable(tensor); }
});
}
} // namespace one
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/python/autograd/autograd_function_state.h"

#include <memory>
#include <new>

#include <pybind11/pybind11.h>

#include "oneflow/api/python/exception/exception.h"
#include "oneflow/api/python/functional/common.h"
#include "oneflow/api/python/of_api_registry.h"
namespace py = pybind11;
namespace oneflow {
namespace one {
namespace {
// Returns the raw capture-state pointer behind `state`, or nullptr (with a
// Python RuntimeError set) when the weak reference has expired. Callers MUST
// check for nullptr before dereferencing; the returned raw pointer does not
// extend the object's lifetime.
inline FunctionAutoGradCaptureState* CheckAndGetStateData(PyAutogradFunctionState* state) {
  // Lock the weak_ptr exactly once so the aliveness check and the pointer we
  // hand back come from the same observation (the original locked twice).
  const std::shared_ptr<FunctionAutoGradCaptureState> data = state->data.lock();
  if (!data) {
    PyErr_Format(PyExc_RuntimeError, "Data is deallocated. Please don't hold context outside "
                                     "autograd.Function.forward or autograd.Function.backward");
    return nullptr;
  }
  return data.get();
}
} // namespace
#if PY_VERSION_HEX < 0x03070000
#define PYGETSET_NAME(name) const_cast<char*>(name)
#else
#define PYGETSET_NAME(name) (name)
#endif
#define PY_XINCREF(p) (({ Py_XINCREF(p); }), (p))
// tp_new: allocates the instance and prepares its members. tp_alloc zero-fills
// the storage but does NOT run C++ constructors, so the non-trivial weak_ptr
// member is constructed explicitly with placement new before it is ever
// assigned to (assigning to never-constructed storage is UB).
static PyObject* PyAutogradFunctionState_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
  PyAutogradFunctionState* self = (PyAutogradFunctionState*)type->tp_alloc(type, 0);
  if (self == NULL) { return NULL; }
  new (&self->data) std::weak_ptr<FunctionAutoGradCaptureState>();
  self->dynamic_attr_dict = PyDict_New();
  if (self->dynamic_attr_dict == NULL) {
    Py_DECREF(self);
    return NULL;
  }
  return (PyObject*)self;
}
// tp_dealloc: releases the attribute dict, runs the C++ destructor of the
// weak_ptr member (tp_free only releases raw memory and would otherwise leak
// the weak count on its control block), then frees the object storage.
static void PyAutogradFunctionState_dealloc(PyAutogradFunctionState* self) {
  Py_XDECREF(self->dynamic_attr_dict);
  self->data.~weak_ptr();
  Py_TYPE(self)->tp_free((PyObject*)self);
}
// PyMethodDef start
// ctx.save_for_backward(*tensors): stashes tensors on the capture state so the
// backward pass can retrieve them via the `saved_tensors` property.
static PyObject* PyAutogradFunctionState_save_for_backward(PyObject* self, PyObject* args) {
  HANDLE_ERRORS
  auto* _self = (PyAutogradFunctionState*)self;
  if (!functional::PyTensorSequenceCheck(args)) {
    return PyErr_Format(PyExc_TypeError, "save_for_backward() only support Tensor or Tensors");
  }
  // Resolve the capture state once (not per tensor) and propagate the
  // RuntimeError set by CheckAndGetStateData instead of dereferencing nullptr.
  FunctionAutoGradCaptureState* state_data = CheckAndGetStateData(_self);
  if (state_data == nullptr) { return NULL; }
  const std::vector<std::shared_ptr<Tensor>>& tensor_list =
      functional::PyUnpackTensorSequence(args);
  for (const auto& tensor : tensor_list) { state_data->SaveTensorForBackward(tensor); }
  Py_RETURN_NONE;
  END_HANDLE_ERRORS
}
// ctx.mark_non_differentiable(*tensors): flags outputs that should not
// receive gradients during the backward pass.
static PyObject* PyAutogradFunctionState_mark_non_differentiable(PyObject* self, PyObject* args) {
  HANDLE_ERRORS
  auto* _self = (PyAutogradFunctionState*)self;
  if (!functional::PyTensorSequenceCheck(args)) {
    // Fixed copy-pasted message that previously named save_for_backward().
    return PyErr_Format(PyExc_TypeError,
                        "mark_non_differentiable() only support Tensor or Tensors");
  }
  // Resolve the capture state once and propagate the RuntimeError set by
  // CheckAndGetStateData instead of dereferencing nullptr.
  FunctionAutoGradCaptureState* state_data = CheckAndGetStateData(_self);
  if (state_data == nullptr) { return NULL; }
  const std::vector<std::shared_ptr<Tensor>>& tensor_list =
      functional::PyUnpackTensorSequence(args);
  for (const auto& tensor : tensor_list) { state_data->MarkNonDifferentiable(tensor); }
  Py_RETURN_NONE;
  END_HANDLE_ERRORS
}
// ctx._is_data_valid(): True while the underlying FunctionAutoGradCaptureState
// is still alive (i.e. the weak reference has not expired).
static PyObject* PyAutogradFunctionState_is_data_valid(PyObject* self) {
  auto* _self = (PyAutogradFunctionState*)self;
  const bool alive = static_cast<bool>(_self->data.lock());
  return functional::CastToPyObject(alive);
}
// Method table for FunctionCtx. save_for_backward/mark_non_differentiable
// take *args tensor sequences (METH_VARARGS); _is_data_valid takes none.
static PyMethodDef PyAutogradFunctionState_methods[] = {
    {"save_for_backward", (PyCFunction)PyAutogradFunctionState_save_for_backward, METH_VARARGS,
     NULL},
    {"mark_non_differentiable", (PyCFunction)PyAutogradFunctionState_mark_non_differentiable,
     METH_VARARGS, NULL},
    {"_is_data_valid", (PyCFunction)PyAutogradFunctionState_is_data_valid, METH_NOARGS, NULL},
    {NULL} /* Sentinel */
};
// PyMethodDef end
// PyAutogradFunctionState_getset start
// Getter for ctx.saved_tensors: returns the tensors previously stored via
// save_for_backward(), or raises RuntimeError when the state has expired.
static PyObject* PyAutogradFunctionState_saved_tensors(PyObject* self, void*) {
  auto* _self = (PyAutogradFunctionState*)self;
  // Propagate the RuntimeError set by CheckAndGetStateData instead of
  // dereferencing a nullptr when the capture state is gone.
  FunctionAutoGradCaptureState* state_data = CheckAndGetStateData(_self);
  if (state_data == nullptr) { return NULL; }
  return functional::CastToPyObject<Maybe<TensorTuple>>(state_data->SavedTensors());
}
// Getter for ctx.__dict__: exposes the per-instance attribute dict.
static PyObject* PyAutogradFunctionState_get_dict(PyObject* self, PyObject* args) {
  HANDLE_ERRORS
  auto* _self = (PyAutogradFunctionState*)self;
  // A tp_getset getter must return a NEW reference; the previous code handed
  // out a borrowed one, causing a refcount underflow when the caller DECREFs.
  Py_INCREF(_self->dynamic_attr_dict);
  return _self->dynamic_attr_dict;
  END_HANDLE_ERRORS
}
// Property table: read-only `saved_tensors` and the `__dict__` view of the
// per-instance attribute dict (no setters are installed).
static PyGetSetDef PyAutogradFunctionState_properties[] = {
    {PYGETSET_NAME("saved_tensors"), (getter)PyAutogradFunctionState_saved_tensors, NULL, NULL,
     NULL},
    {PYGETSET_NAME("__dict__"), (getter)PyAutogradFunctionState_get_dict, NULL, NULL, NULL},
    {NULL} /* Sentinel */
};
// PyAutogradFunctionState_getset end
// tp_getattro: looks an attribute up in the instance's dynamic_attr_dict
// first, then falls back to generic attribute lookup (methods, properties).
PyObject* PyAutogradFunctionState_getattro(PyObject* self, PyObject* attr) {
  PyObject* res = PyDict_GetItem(((PyAutogradFunctionState*)self)->dynamic_attr_dict, attr);
  if (res != NULL) {
    // PyDict_GetItem returns a BORROWED reference; tp_getattro must return a
    // new one or the caller's DECREF corrupts the dict entry's refcount.
    Py_INCREF(res);
    return res;
  }
  // Not found in dynamic_attr_dict, try to find it in tp_dict.
  res = PyObject_GenericGetAttr(self, attr);
  if (res == NULL) {
    return PyErr_Format(PyExc_AttributeError, "attribute %s not found", PyUnicode_AsUTF8(attr));
  }
  return res;
}
// tp_setattro: stores every attribute in the instance's dynamic_attr_dict.
// `value == NULL` means attribute deletion (`del ctx.x`); PyDict_SetItem does
// not accept NULL values, so route deletion through PyDict_DelItem.
int PyAutogradFunctionState_setattro(PyObject* self, PyObject* attr, PyObject* value) {
  auto* _self = (PyAutogradFunctionState*)self;
  if (value == NULL) { return PyDict_DelItem(_self->dynamic_attr_dict, attr); }
  return PyDict_SetItem(_self->dynamic_attr_dict, attr, value);
}
// Type object for oneflow.autograd.Function.FunctionCtx. Attribute access is
// routed through the custom getattro/setattro pair backed by
// dynamic_attr_dict, which tp_dictoffset also exposes as __dict__.
PyTypeObject PyAutogradFunctionState_Type = {
    PyVarObject_HEAD_INIT(NULL, 0) "oneflow.autograd.Function.FunctionCtx", /* tp_name */
    sizeof(PyAutogradFunctionState),                                        /* tp_basicsize */
    0,                                                                      /* tp_itemsize */
    (destructor)PyAutogradFunctionState_dealloc,                            /* tp_dealloc */
    0,                                                                      /* tp_vectorcall_offset */
    NULL,                                                                   /* tp_getattr */
    NULL,                                                                   /* tp_setattr */
    NULL,                                                                   /* tp_reserved */
    NULL,                                                                   /* tp_repr */
    NULL,                                                                   /* tp_as_number */
    NULL,                                                                   /* tp_as_sequence */
    NULL,                                                                   /* tp_as_mapping */
    NULL,                                                                   /* tp_hash */
    NULL,                                                                   /* tp_call */
    NULL,                                                                   /* tp_str */
    PyAutogradFunctionState_getattro,                                       /* tp_getattro */
    PyAutogradFunctionState_setattro,                                       /* tp_setattro */
    NULL,                                                                   /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,                               /* tp_flags */
    NULL,                                                                   /* tp_doc */
    NULL,                                                                   /* tp_traverse */
    NULL,                                                                   /* tp_clear */
    NULL,                                                                   /* tp_richcompare */
    0,                                                                      /* tp_weaklistoffset */
    NULL,                                                                   /* tp_iter */
    NULL,                                                                   /* tp_iternext */
    PyAutogradFunctionState_methods,                                        /* tp_methods */
    NULL,                                                                   /* tp_members */
    PyAutogradFunctionState_properties,                                     /* tp_getset */
    0,                                                                      /* tp_base */
    NULL,                                                                   /* tp_dict */
    NULL,                                                                   /* tp_descr_get */
    NULL,                                                                   /* tp_descr_set */
    offsetof(PyAutogradFunctionState, dynamic_attr_dict),                   /* tp_dictoffset */
    NULL,                                                                   /* tp_init */
    NULL,                                                                   /* tp_alloc */
    PyAutogradFunctionState_new,                                            /* tp_new */
    NULL,                                                                   /* tp_free */
};
// Wraps `data` in a PyAutogradFunctionState, caching the wrapper on the
// capture state so repeated calls return the same Python object. Returns a
// new reference (or None when `data` is null).
PyObject* PyAutogradFunctionState_NewFromPtr(
    const std::shared_ptr<FunctionAutoGradCaptureState>& data) {
  if (!data) { Py_RETURN_NONE; }
  // Cached wrapper exists: hand out a fresh strong reference to it.
  if (data->pyobject()) { return PY_XINCREF((PyObject*)data->pyobject()); }
  auto* self = (PyAutogradFunctionState*)(PyObject_CallObject(
      (PyObject*)&PyAutogradFunctionState_Type, NULL));
  if (self) {
    // Two references are alive on success: the one from PyObject_CallObject
    // (returned to the caller) and the extra PY_XINCREF'd one owned by the
    // capture state, released by the unique_ptr deleter installed below.
    PY_XINCREF(self);
    self->data = data;
    CheckAndGetStateData(self)->set_pyobject_ptr(
        std::unique_ptr<void, void (*)(void*)>(self, [](void* ptr) { Py_DECREF((PyObject*)ptr); }));
  }
  return (PyObject*)self;
}
// Module init: finalizes the FunctionCtx type and registers it on the
// "autograd.Function" submodule.
ONEFLOW_API_PYBIND11_MODULE("autograd.Function", m) {
  if (PyType_Ready(&PyAutogradFunctionState_Type) < 0) { return; }
  Py_INCREF(&PyAutogradFunctionState_Type);
  if (PyModule_AddObject(m.ptr(), "FunctionCtx", (PyObject*)&PyAutogradFunctionState_Type) < 0) {
    // FIX: PyModule_AddObject steals the reference only on SUCCESS; on
    // failure we must drop the reference taken above or it leaks.
    Py_DECREF(&PyAutogradFunctionState_Type);
    return;
  }
}
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_AUTOGRAD_AUTOGRAD_FUNCTION_STATE_H_
#define ONEFLOW_API_PYTHON_AUTOGRAD_AUTOGRAD_FUNCTION_STATE_H_
#include <Python.h>
#include <pybind11/pybind11.h>
#include "oneflow/core/framework/op_expr_grad_function.h"
namespace oneflow {
namespace one {
// Python object layout backing oneflow.autograd.Function.FunctionCtx.
typedef struct {
  PyObject_HEAD;
  // Per-instance dict holding user-set attributes; consulted first by the
  // custom tp_getattro and exposed as __dict__ via tp_dictoffset.
  PyObject* dynamic_attr_dict;
  // Non-owning handle to the C++ capture state (weak, so the Python wrapper
  // does not keep the autograd state alive).  NOTE(review): a non-trivial
  // C++ member inside a PyObject requires tp_new/tp_dealloc to construct and
  // destroy it explicitly — confirm the implementation does so.
  std::weak_ptr<FunctionAutoGradCaptureState> data;
} PyAutogradFunctionState;
extern PyTypeObject PyAutogradFunctionState_Type;
// True iff `state` is a FunctionCtx instance (subclasses included).
inline bool PyAutogradFunctionState_Check(PyObject* state) {
  const int type_matches = PyObject_TypeCheck(state, &PyAutogradFunctionState_Type);
  return type_matches != 0;
}
PyObject* PyAutogradFunctionState_NewFromPtr(
const std::shared_ptr<FunctionAutoGradCaptureState>& data);
} // namespace one
} // namespace oneflow
#endif // ONEFLOW_API_PYTHON_AUTOGRAD_AUTOGRAD_FUNCTION_STATE_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_CASTER_AUTOGRAD_FUNCTION_STATE_H_
#define ONEFLOW_API_PYTHON_CASTER_AUTOGRAD_FUNCTION_STATE_H_
#include <pybind11/pybind11.h>
#include "oneflow/api/python/caster/common.h"
#include "oneflow/api/python/autograd/autograd_function_state.h"
namespace py = pybind11;
namespace pybind11 {
namespace detail {
// pybind11 type caster bridging std::shared_ptr<FunctionAutoGradCaptureState>
// and the Python-side FunctionCtx object, bypassing pybind11's default
// holder machinery.
template<typename T>
struct autograd_function_state_type_caster {
 public:
  // Python -> C++: accepts None (leaves value_ null but reports success) and
  // FunctionCtx instances; anything else is rejected so pybind11 can try
  // other overloads.
  bool load(handle src, bool convert) {
    using namespace oneflow::one;
    value_ = nullptr;
    if (!src) { return false; }
    if (src.is_none()) { return true; }
    if (!PyAutogradFunctionState_Check(src.ptr())) { return false; }
    // NOTE(review): this assigns the struct's `data` member into a
    // shared_ptr; if `data` is declared as a weak_ptr (as the header in this
    // chunk shows) the assignment needs `.lock()` — confirm against the
    // actual struct definition.
    value_ = ((PyAutogradFunctionState*)src.ptr())->data;
    return true;
  }

  // C++ -> Python: wrap (or re-use the cached wrapper of) the state.
  // NewFromPtr returns a new reference, which reinterpret_steal + release
  // hands over to pybind11 without an extra refcount bump.
  template<typename U>
  static handle cast(U&& src, return_value_policy policy, handle parent) {
    using namespace oneflow::one;
    return reinterpret_steal<object>(
               PyAutogradFunctionState_NewFromPtr(
                   std::const_pointer_cast<FunctionAutoGradCaptureState>(src)))
        .release();
  }

  // Conversion operators pybind11 uses to extract the loaded value.
  operator std::shared_ptr<T>*() { return &value_; }
  operator std::shared_ptr<T>&() { return value_; }
  operator std::shared_ptr<T>&&() && { return std::move(value_); }
  static constexpr auto name = _("autograd_function_state");

 protected:
  std::shared_ptr<T> value_;
};
// Route shared_ptr<FunctionAutoGradCaptureState> (both const and non-const)
// through the custom caster above instead of pybind11's default holder
// caster.
template<>
struct type_caster<std::shared_ptr<oneflow::one::FunctionAutoGradCaptureState>>
    : public autograd_function_state_type_caster<oneflow::one::FunctionAutoGradCaptureState> {};
template<>
struct type_caster<std::shared_ptr<const oneflow::one::FunctionAutoGradCaptureState>>
    : public autograd_function_state_type_caster<const oneflow::one::FunctionAutoGradCaptureState> {
};
} // namespace detail
} // namespace pybind11
#endif // ONEFLOW_API_PYTHON_CASTER_AUTOGRAD_FUNCTION_STATE_H_
......@@ -13,8 +13,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <type_traits>
#ifndef ONEFLOW_API_PYTHON_CASTER_COMMON_H_
#define ONEFLOW_API_PYTHON_CASTER_COMMON_H_
#include <type_traits>
#include <pybind11/pybind11.h>
namespace pybind11 {
......@@ -49,3 +51,5 @@ using IsSupportedByPybind11WhenInsideSharedPtr =
} // namespace detail
} // namespace pybind11
#endif // ONEFLOW_API_PYTHON_CASTER_COMMON_H_
......@@ -13,6 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_CASTER_MAYBE_H_
#define ONEFLOW_API_PYTHON_CASTER_MAYBE_H_
#include <pybind11/pybind11.h>
#include "oneflow/api/python/caster/common.h"
......@@ -84,7 +86,7 @@ template<>
struct maybe_caster<Maybe<void>> {
template<typename T>
static handle cast(T&& src, return_value_policy policy, handle parent) {
if (!src.IsOk()) { oneflow::ThrowError(src.error()); }
if (!src.IsOk()) { oneflow::ThrowError(src.stacked_error()); }
return none().inc_ref();
}
......@@ -104,3 +106,5 @@ struct type_caster<Maybe<T>> : public maybe_caster<Maybe<T>> {};
} // namespace detail
} // namespace pybind11
#endif // ONEFLOW_API_PYTHON_CASTER_MAYBE_H_
......@@ -13,6 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_PYTHON_CASTER_OPTIONAL_H_
#define ONEFLOW_API_PYTHON_CASTER_OPTIONAL_H_
#include <pybind11/pybind11.h>
#include "oneflow/api/python/caster/common.h"
......@@ -109,3 +112,5 @@ struct type_caster<Optional<T>> : public oneflow_optional_caster<Optional<T>> {}
} // namespace detail
} // namespace pybind11
#endif // ONEFLOW_API_PYTHON_CASTER_OPTIONAL_H_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment