Commit 21d47d0e authored by yuguo's avatar yuguo
Browse files

Oneflow 0.8 for DCU

parents
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_CPP_FRAMEWORK_SHAPE_H_
#define ONEFLOW_API_CPP_FRAMEWORK_SHAPE_H_
#include <memory>
#include <vector>
namespace oneflow {
class Shape;
}
namespace oneflow_api {
// Public C++ API wrapper around oneflow::Shape. The internal shape is held
// through a shared_ptr, so copies of Shape are cheap and alias the same
// underlying object.
class Shape final {
  friend class Tensor;

 public:
  // Constructs an empty shape (no axes).
  Shape();
  // Constructs a shape from an explicit dimension vector.
  explicit Shape(const std::vector<int64_t>& dim_vec);
  // Allows brace-initialization, e.g. Shape({2, 3}).
  Shape(const std::initializer_list<int64_t>& dim_vec);
  ~Shape() = default;
  Shape& operator=(const Shape& shape);

  [[nodiscard]] bool operator==(const Shape& rhs) const;
  [[nodiscard]] bool operator!=(const Shape& rhs) const;

  // Sets the dimension at `index` to `val`.
  void Set(int64_t index, int64_t val);
  // Total number of elements in the shape.
  [[nodiscard]] int64_t elem_cnt() const;
  // Dimension size at `index`.
  [[nodiscard]] int64_t At(int64_t index) const;
  // Number of axes (rank).
  [[nodiscard]] int64_t NumAxes() const;
  // Product of the dimensions in [begin_axis, end_axis).
  [[nodiscard]] int64_t Count(int64_t begin_axis, int64_t end_axis) const;
  // Product of the dimensions from begin_axis to the last axis
  // (Count(0) is used throughout this API as the element count).
  [[nodiscard]] int64_t Count(int64_t begin_axis) const;

 private:
  std::shared_ptr<oneflow::Shape> shape_ = nullptr;
  friend std::ostream& operator<<(std::ostream&, const Shape&);
};
} // namespace oneflow_api
#endif // ONEFLOW_API_CPP_FRAMEWORK_SHAPE_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/cpp/framework/tensor.h"
#include "oneflow/api/cpp/framework/device.h"
#include "oneflow/api/cpp/framework/dtype.h"
#include "oneflow/api/cpp/framework/shape.h"
#include "oneflow/core/common/data_type.pb.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/job/lazy_mode.h"
#include "oneflow/core/framework/instructions_builder.h"
#include "oneflow/core/register/ofblob.h"
#include "oneflow/api/common/ofblob.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/vm/virtual_machine.h"
namespace oneflow_api {
namespace of = oneflow;
namespace functional = of::one::functional;
// Allocates an uninitialized tensor of `shape`/`dtype` on `device`
// (via functional::Empty; the memory content is unspecified).
Tensor::Tensor(const Shape& shape, const Device& device, const DType& dtype) {
  // Force eager execution even if a graph is currently being traced.
  of::LazyMode::Guard lazy_mode_disabled_guard(/*is_enabled*/ false);
  tensor_ = functional::Empty(*shape.shape_,
                              of::DType::Get(static_cast<of::DataType>(dtype)).GetOrThrow(),
                              *device.device_, /*pin_memory=*/false)
                .GetPtrOrThrow();
}
// Wraps an existing internal tensor; the handle shares ownership with it.
Tensor::Tensor(const std::shared_ptr<oneflow::one::Tensor>& tensor) : tensor_(tensor) {}

// Copies are shallow: both handles alias the same underlying tensor.
Tensor::Tensor(const Tensor& tensor) : tensor_(tensor.tensor_) {}

Tensor::Tensor(Tensor&& tensor) noexcept : tensor_(std::move(tensor.tensor_)) {}

Tensor& Tensor::operator=(const Tensor& tensor) {
  if (&tensor == this) { return *this; }  // self-assignment guard
  tensor_ = tensor.tensor_;
  return *this;
}

Tensor& Tensor::operator=(Tensor&& tensor) noexcept {
  if (&tensor == this) { return *this; }  // self-move guard
  tensor_ = std::move(tensor.tensor_);
  return *this;
}
// Returns a copy of the tensor's shape as a public-API Shape.
Shape Tensor::shape() const {
  const auto& dims = tensor_->shape()->dim_vec();
  return Shape(std::vector<int64_t>(dims.begin(), dims.end()));
}
// Returns the device the tensor lives on as a public-API Device.
Device Tensor::device() const {
  const auto internal_device = tensor_->device().GetOrThrow();
  return Device(internal_device->type(), internal_device->device_id());
}
// Element data type, mapped onto the public DType enum.
DType Tensor::dtype() const { return static_cast<DType>(tensor_->dtype()->data_type()); }
// Fills the tensor with zeros in place by scheduling an async memset on the
// tensor's blob through the virtual machine.
void Tensor::zeros_() {
  std::shared_ptr<of::one::MirroredTensor> local_tensor =
      tensor_->AsMirroredTensor().GetPtrOrThrow();
  of::PhysicalRun([&](of::InstructionsBuilder* builder) -> of::Maybe<void> {
    JUST(builder->AccessBlobByCallback(
        local_tensor,
        [](uint64_t of_blob_ptr) {
          // The callback receives the blob as an opaque pointer value.
          auto* of_blob = reinterpret_cast<of::OfBlob*>(of_blob_ptr);
          of_blob->AsyncAutoMemset(0);
        },
        // "mut": the blob is accessed for mutation.
        "mut"));
    return of::Maybe<void>::Ok();
  }).GetOrThrow();
}
// Builds a tensor of `shape`/`dtype` on `device` and fills it with a copy of
// host memory at `buffer` (shape.Count(0) * GetDTypeSize(dtype) bytes read).
Tensor Tensor::from_buffer(const void* buffer, const Shape& shape, const Device& device,
                           const DType& dtype) {
  Tensor tensor(shape, device, dtype);
  std::shared_ptr<of::one::MirroredTensor> local_tensor =
      tensor.tensor_->AsMirroredTensor().GetPtrOrThrow();
  of::PhysicalRun([&](of::InstructionsBuilder* builder) -> of::Maybe<void> {
    return builder->AccessBlobByCallback(
        local_tensor,
        // shape/dtype are captured by value since the callback may run after
        // this frame; `buffer` is a raw pointer and must stay valid until the
        // scheduled copy executes.
        [buffer, shape, dtype](uint64_t ofblob_ptr) {
          CHECK_JUST(of::BlobBufferCopyUtil<void>::From(ofblob_ptr, buffer,
                                                        shape.Count(0) * GetDTypeSize(dtype)));
        },
        "mut");
  }).GetOrThrow();
  return tensor;
}
// Copies the tensor's elements into `buffer`, which must hold at least
// shape().Count(0) elements of type T. Blocks until the copy has completed.
template<typename T>
void Tensor::copy_to(T* buffer) const {
  std::shared_ptr<of::one::MirroredTensor> local_tensor =
      tensor_->AsMirroredTensor().GetPtrOrThrow();
  const auto shape = this->shape();
  const auto& Callback = [buffer, shape](uint64_t ofblob_ptr) {
    CHECK_JUST(of::BlobBufferCopyUtil<T>::To(ofblob_ptr, buffer, shape.Count(0)));
  };
  // NOTE(review): BlockingThenBusy(1) presumably counts the one scheduled
  // instruction; WaitUntilCntEqualZero below blocks until it retires.
  auto btb = std::make_shared<of::BlockingThenBusy>(1);
  CHECK_JUST(of::PhysicalRun([&](of::InstructionsBuilder* builder) -> of::Maybe<void> {
    return builder->SyncAccessBlobByCallback(local_tensor, btb, Callback, "const");
  }));
  TRY(btb->WaitUntilCntEqualZero(of::VirtualMachine::GetPredicatorNoMoreInstructionsFinished()))
      .GetOrThrow();
}
// Escape hatch used by Graph and the nn helpers; not part of the public
// contract (see the header comment: never call this directly).
const std::shared_ptr<oneflow::one::Tensor>& Tensor::__internal_tensor() const { return tensor_; }

// Explicitly instantiate copy_to for each supported element type so the
// template definition can stay in this .cpp file.
#define REGISTER_TENSOR_COPY_TO(cpp_dtype) \
  template void Tensor::copy_to<cpp_dtype>(cpp_dtype * buffer) const;

REGISTER_TENSOR_COPY_TO(float)
REGISTER_TENSOR_COPY_TO(double)
REGISTER_TENSOR_COPY_TO(bool)
REGISTER_TENSOR_COPY_TO(int8_t)
REGISTER_TENSOR_COPY_TO(int32_t)
REGISTER_TENSOR_COPY_TO(int64_t)
} // namespace oneflow_api
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_CPP_FRAMEWORK_TENSOR_H_
#define ONEFLOW_API_CPP_FRAMEWORK_TENSOR_H_
#include <memory>
#include "device.h"
#include "shape.h"
#include "dtype.h"
namespace oneflow {
namespace one {
class Tensor;
}
} // namespace oneflow
namespace oneflow_api {
// Public C++ API tensor: a thin handle over oneflow::one::Tensor shared via
// shared_ptr, so copies alias the same underlying storage.
class Tensor final {
  friend class Graph;

 public:
  // Allocates an uninitialized tensor (defaults: empty shape, CPU, float).
  explicit Tensor(const Shape& shape = Shape(), const Device& device = Device("cpu"),
                  const DType& dtype = DType::kFloat);
  // Wraps an existing internal tensor without copying its data.
  explicit Tensor(const std::shared_ptr<oneflow::one::Tensor>& tensor);
  Tensor(const Tensor& tensor);
  Tensor(Tensor&& tensor) noexcept;
  ~Tensor() = default;

  Tensor& operator=(const Tensor& tensor);
  Tensor& operator=(Tensor&& tensor) noexcept;

  [[nodiscard]] Shape shape() const;
  [[nodiscard]] Device device() const;
  [[nodiscard]] DType dtype() const;

  // Fills the tensor with zeros in place.
  void zeros_();

  // You should never call __internal_tensor() directly.
  [[nodiscard]] const std::shared_ptr<oneflow::one::Tensor>& __internal_tensor() const;

  // Copies the elements into `buffer`, which must hold at least
  // shape().Count(0) elements of T. Blocks until the copy finishes.
  template<typename T>
  void copy_to(T* buffer) const;

  // Builds a tensor on `device` filled with a copy of host memory `buffer`.
  [[nodiscard]] static Tensor from_buffer(const void* buffer, const Shape& shape,
                                          const Device& device, const DType& dtype);

 private:
  std::shared_ptr<oneflow::one::Tensor> tensor_ = nullptr;
};
} // namespace oneflow_api
#endif // ONEFLOW_API_CPP_FRAMEWORK_TENSOR_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_CPP_NN_H_
#define ONEFLOW_API_CPP_NN_H_
#include "nn/functional/activation.h"
#endif // ONEFLOW_API_CPP_NN_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/cpp/nn/functional/activation.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow_api {
namespace nn {
namespace of = oneflow;
namespace functional = of::one::functional;
// Element-wise ReLU; returns a new tensor, the input is left untouched.
Tensor relu(const Tensor& tensor) {
  const auto result = functional::Relu(tensor.__internal_tensor(), /*inplace=*/false).GetPtrOrThrow();
  return Tensor(result);
}
} // namespace nn
} // namespace oneflow_api
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_CPP_NN_FUNCTIONAL_ACTIVATION_H_
#define ONEFLOW_API_CPP_NN_FUNCTIONAL_ACTIVATION_H_
#include "../../framework.h"
namespace oneflow_api {
namespace nn {
Tensor relu(const Tensor& tensor);
}
} // namespace oneflow_api
#endif // ONEFLOW_API_CPP_NN_FUNCTIONAL_ACTIVATION_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/api/cpp/tests/api_test.h"
#include <cstddef>
#include <random>
#include <string>
#ifdef __linux__
#include <unistd.h> // readlink
#elif defined(__APPLE__)
#include <mach-o/dyld.h> // _NSGetExecutablePath
#endif
namespace oneflow_api {
namespace {
// File-local RNG shared by all random helpers in this translation unit.
std::mt19937 rng(std::random_device{}());
}

// Returns a random shape with 1-4 axes, each dimension in [16, 64].
Shape RandomShape() {
  std::uniform_int_distribution<> dist_ndim(1, 4), dist_dims(16, 64);
  std::vector<std::int64_t> dims(dist_ndim(rng), 0);
  for (auto& x : dims) { x = dist_dims(rng); }
  return Shape(dims);
}
// Returns `size` values drawn uniformly from [-100, 100] and cast to T.
template<typename T>
std::vector<T> RandomData(size_t size) {
  std::uniform_int_distribution<> dist(-100, 100);
  std::vector<T> values;
  values.reserve(size);
  for (size_t i = 0; i < size; ++i) { values.push_back(static_cast<T>(dist(rng))); }
  return values;
}

// Explicit instantiations for every element type the tests exercise.
#define REGISTER_RANDOM_DATA(cpp_dtype) template std::vector<cpp_dtype> RandomData(size_t size);
REGISTER_RANDOM_DATA(float)
REGISTER_RANDOM_DATA(double)
REGISTER_RANDOM_DATA(int8_t)
REGISTER_RANDOM_DATA(int32_t)
REGISTER_RANDOM_DATA(int64_t)
// Returns the directory containing the running executable (no trailing '/').
// Bug fix: the helper lambda previously took `size_t count`, but on Linux
// readlink() returns ssize_t and yields -1 on failure; converting -1 to
// size_t made `(count > 0)` true and constructed a std::string of SIZE_MAX
// bytes (undefined behavior). Taking ssize_t makes the failure path yield "".
std::string GetExeDir() {
  const size_t path_max_size = 4096;  // PATH_MAX = 4096 on linux
  char result[path_max_size];
  // Strips the file name from an executable path of length `count`;
  // a non-positive count (readlink failure) produces an empty path.
  const auto get_dir_from_path = [](const char* path, ssize_t count) -> std::string {
    std::string exe_path(path, (count > 0) ? count : 0);
    // string(path).rfind('/') will never be string::npos on linux or macos.
    return exe_path.substr(0, exe_path.rfind('/'));
  };
#ifdef __linux__
  // readlink does not NUL-terminate; the explicit length handles that.
  ssize_t count = readlink("/proc/self/exe", result, path_max_size);
  return get_dir_from_path(result, count);
#elif defined(__APPLE__)
  uint32_t count = path_max_size;
  CHECK_EQ(_NSGetExecutablePath(result, &count), 0) << "Fail to get executable file path.";
  return get_dir_from_path(result, count);
#else
#error oneflow_api::GetExeDir() has not been supported on windows.
#endif
}
} // namespace oneflow_api
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_CPP_TESTS_API_TEST_H_
#define ONEFLOW_API_CPP_TESTS_API_TEST_H_
#include "oneflow/api/cpp/api.h"
namespace oneflow_api {
// RAII guard for the OneFlow runtime: initialize() on construction,
// release() on destruction. Tests create one per TEST body.
class EnvScope {  // NOLINT
 public:
  EnvScope() { initialize(); }
  ~EnvScope() { release(); }
};

// Returns a random shape with small, test-friendly dimensions.
Shape RandomShape();

// Returns `size` random values cast to T.
template<typename T>
std::vector<T> RandomData(size_t size);

// Directory containing the currently running test executable.
std::string GetExeDir();
} // namespace oneflow_api
#endif // !ONEFLOW_API_CPP_TESTS_API_TEST_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <gtest/gtest.h>
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdint>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>
#include "oneflow/api/cpp/framework.h"
#include "oneflow/api/cpp/framework/dtype.h"
#include "oneflow/api/cpp/framework/shape.h"
#include "oneflow/api/cpp/tests/api_test.h"
namespace oneflow_api {
namespace {
// Loads the affine-with-parameter test model onto `device`.
inline Graph LoadGraph(const Device& device) {
  return Graph::Load("./oneflow/api/cpp/tests/graph_test_model/affine_with_parameter", device);
}
// Feeds an all-ones (batch, 3) input through `graph` and checks that the
// (batch, 4) output is all 4s — the saved affine model's expected response
// to a ones input.
inline void Forward(Graph& graph, const Device& device, int expected_batch_dim = 1) {
  std::vector<float> data(expected_batch_dim * 3);
  std::fill(data.begin(), data.end(), 1);
  std::vector<Tensor> inputs;
  inputs.emplace_back(
      Tensor::from_buffer(data.data(), Shape({expected_batch_dim, 3}), device, DType::kFloat));
  const auto& value = graph.Forward(inputs);
  ASSERT_TRUE(value.IsTensor());
  Tensor output = value.ToTensor();
  Shape shape = output.shape();
  ASSERT_EQ(shape.At(0), expected_batch_dim);
  ASSERT_EQ(shape.At(1), 4);
  std::vector<float> buf(expected_batch_dim * 4);
  output.copy_to(buf.data());
  for (const float& element : buf) { ASSERT_EQ(element, 4); }
}
} // namespace
// Runs the affine model end-to-end on CPU with batch size 1.
TEST(Api, graph_cpu_test) {
  EnvScope scope;
  Device device("cpu");
  Graph graph = LoadGraph(device);
  Forward(graph, device, 1);
}

#ifdef WITH_CUDA
// Same model on a single GPU (default batch size).
TEST(Api, graph_gpu_test) {
  EnvScope scope;
  Device device("cuda", 0);
  Graph graph = LoadGraph(device);
  Forward(graph, device);
}

// Independent copies of the model on two different GPUs.
TEST(Api, graph_multi_gpu_test) {
  EnvScope scope;
  Device device("cuda", 0);
  Graph graph = LoadGraph(device);
  Forward(graph, device);
  Device device1("cuda", 1);
  Graph graph1 = LoadGraph(device1);
  Forward(graph1, device1);
}
#endif
// Batch-size override on CPU: the same model forwarded with batch 10.
TEST(Api, graph_cpu_batching_test) {
  EnvScope scope;
  Device device("cpu");
  Graph graph = LoadGraph(device);
  graph.set_batch_size(10);
  Forward(graph, device, 10);
}

#ifdef WITH_CUDA
// Batch-size override on GPU.
TEST(Api, graph_gpu_batching_test) {
  EnvScope scope;
  Device device("cuda", 0);
  Graph graph = LoadGraph(device);
  graph.set_batch_size(10);
  Forward(graph, device, 10);
}

// Graphs on two GPUs plus CPU coexisting inside one runtime.
TEST(Api, graph_multi_device_test) {
  EnvScope scope;
  Device device("cuda", 0);
  Graph graph = LoadGraph(device);
  Forward(graph, device, 1);
  Device device1("cuda", 1);
  Graph graph1 = LoadGraph(device1);
  Forward(graph1, device1, 1);
  Device device2("cpu");
  Graph graph2 = LoadGraph(device2);
  Forward(graph2, device2, 1);
}

// Graphs destroyed at different scope depths, then a fresh runtime reused.
TEST(Api, graph_unload_test) {
  {
    EnvScope scope;
    Device device("cuda", 0);
    Graph graph = LoadGraph(device);
    Forward(graph, device, 1);
    {
      // Inner scope: graph1 is unloaded before graph/graph2.
      Device device1("cuda", 1);
      Graph graph1 = LoadGraph(device1);
      Forward(graph1, device1, 1);
    }
    Device device2("cpu");
    Graph graph2 = LoadGraph(device2);
    Forward(graph2, device2, 1);
  }
  {
    // Second EnvScope: the runtime is re-initialized after full teardown.
    EnvScope scope;
    Device device("cpu");
    Graph graph = LoadGraph(device);
    Forward(graph, device, 1);
  }
}
#endif
// Runs ten independently-loaded graphs concurrently, one thread each.
TEST(Api, graph_thread_test) {
  EnvScope scope;
  Device device("cpu");
  std::vector<Graph> graphs;
  for (int i = 0; i < 10; i++) { graphs.emplace_back(LoadGraph(device)); }
  std::vector<std::thread> threads;
  for (Graph& graph : graphs) {
    // NOTE(review): std::bind stores a moved-in Graph copy; Forward operates
    // on that stored copy, not on the (moved-from) element in `graphs`.
    threads.emplace_back(std::thread(std::bind(Forward, std::move(graph), device, 1)));
  }
  for (auto& thread : threads) { thread.join(); }
}
// Checks that inputs are consumed in declaration order by a model with no
// stored parameters: y = x * a + b with x, a, b all ones => output all 4s?
// No — (1x3) x (3x2) of ones is 3, plus bias 1 gives 4 per element.
TEST(Api, graph_input_order_test) {
  EnvScope scope;
  Device device("cpu");
  Graph graph = Graph::Load("./oneflow/api/cpp/tests/graph_test_model/affine_no_parameter", device);
  std::vector<Tensor> inputs;
  std::vector<float> x(3);
  std::fill(x.begin(), x.end(), 1);
  inputs.emplace_back(Tensor::from_buffer(x.data(), Shape({1, 3}), device, DType::kFloat));
  std::vector<float> a(3 * 2);
  std::fill(a.begin(), a.end(), 1);
  inputs.emplace_back(Tensor::from_buffer(a.data(), Shape({3, 2}), device, DType::kFloat));
  std::vector<float> b(2);
  std::fill(b.begin(), b.end(), 1);
  inputs.emplace_back(Tensor::from_buffer(b.data(), Shape({2}), device, DType::kFloat));
  const auto& value = graph.Forward(inputs);
  ASSERT_TRUE(value.IsTensor());
  Tensor output = value.ToTensor();
  Shape shape = output.shape();
  ASSERT_EQ(shape.At(0), 1);
  ASSERT_EQ(shape.At(1), 2);
  std::array<float, 2> buf{};
  output.copy_to(buf.data());
  ASSERT_EQ(buf[0], 4);
  ASSERT_EQ(buf[1], 4);
}

// Verifies the reported input/output metadata (dtype, shape, index) of the
// affine-with-parameter model: one (1, 3) float input, one (1, 4) output.
TEST(Api, graph_input_output_infos_test) {
  EnvScope scope;
  Device device("cpu");
  Graph graph = LoadGraph(device);
  auto input_infos = graph.GetInputInfos();
  auto output_infos = graph.GetOutputInfos();
  ASSERT_EQ(input_infos.size(), 1);
  ASSERT_EQ(output_infos.size(), 1);
  auto it = input_infos.begin();
  DType dtype = it->second.datatype_;
  Shape shape = it->second.input_output_shape_;
  size_t order = it->second.input_output_index_;
  ASSERT_EQ(dtype, DType::kFloat);
  ASSERT_EQ(shape.NumAxes(), 2);
  ASSERT_EQ(shape.At(0), 1);
  ASSERT_EQ(shape.At(1), 3);
  ASSERT_EQ(order, 0);
  it = output_infos.begin();
  dtype = it->second.datatype_;
  shape = it->second.input_output_shape_;
  order = it->second.input_output_index_;
  ASSERT_EQ(dtype, DType::kFloat);
  ASSERT_EQ(shape.NumAxes(), 2);
  ASSERT_EQ(shape.At(0), 1);
  ASSERT_EQ(shape.At(1), 4);
  ASSERT_EQ(order, 0);
}
} // namespace oneflow_api
module {
oneflow.job @MyGraph_1(%arg0: tensor<1x3xf32>, %arg1: tensor<3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x2xf32> {
%output = "oneflow.input"(%arg0) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-input_0", output_lbns = ["_MyGraph_1-input_0/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [1 : si64, 3 : si64]} : (tensor<1x3xf32>) -> tensor<1x3xf32>
%output_0 = "oneflow.input"(%arg1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-input_1", output_lbns = ["_MyGraph_1-input_1/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [3 : si64, 2 : si64]} : (tensor<3x2xf32>) -> tensor<3x2xf32>
%output_1 = "oneflow.input"(%arg2) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-input_2", output_lbns = ["_MyGraph_1-input_2/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [2 : si64]} : (tensor<2xf32>) -> tensor<2xf32>
%0 = "oneflow.matmul"(%output, %output_0) {alpha = 1.000000e+00 : f64, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-matmul_0", output_lbns = ["model-matmul_0/out_0"], scope_symbol_id = 4611686018427535359 : i64, transpose_a = false, transpose_b = false} : (tensor<1x3xf32>, tensor<3x2xf32>) -> tensor<1x2xf32>
%1 = "oneflow.broadcast_add"(%0, %output_1) {device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-broadcast_add_1", output_lbns = ["model-broadcast_add_1/z_0"], scope_symbol_id = 4611686018427535359 : i64} : (tensor<1x2xf32>, tensor<2xf32>) -> tensor<1x2xf32>
%output_2 = "oneflow.output"(%1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_1-output_0", output_lbns = ["_MyGraph_1-output_0/out"], scope_symbol_id = 4611686018427527167 : i64, shape = [1 : si64, 2 : si64]} : (tensor<1x2xf32>) -> tensor<1x2xf32>
oneflow.return %output_2 : tensor<1x2xf32>
}
}
module {
oneflow.job @MyGraph_0(%arg0: tensor<1x3xf32>) -> tensor<1x4xf32> {
%output = "oneflow.input"(%arg0) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_0-input_0", output_lbns = ["_MyGraph_0-input_0/out"], scope_symbol_id = 4611686018427469823 : i64, shape = [1 : si64, 3 : si64]} : (tensor<1x3xf32>) -> tensor<1x3xf32>
%output_0 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], nd_sbp = ["B"], op_name = "model.a", output_lbns = ["model.a/out"], scope_symbol_id = 4611686018427482111 : i64, shape = [3 : si64, 4 : si64]} : () -> tensor<3x4xf32>
%output_1 = "oneflow.variable"() {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], nd_sbp = ["B"], op_name = "model.b", output_lbns = ["model.b/out"], scope_symbol_id = 4611686018427494399 : i64, shape = [4 : si64]} : () -> tensor<4xf32>
%0 = "oneflow.matmul"(%output, %output_0) {alpha = 1.000000e+00 : f64, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-matmul_0", output_lbns = ["model-matmul_0/out_0"], scope_symbol_id = 4611686018427486207 : i64, transpose_a = false, transpose_b = false} : (tensor<1x3xf32>, tensor<3x4xf32>) -> tensor<1x4xf32>
%1 = "oneflow.broadcast_add"(%0, %output_1) {device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], op_name = "model-broadcast_add_1", output_lbns = ["model-broadcast_add_1/z_0"], scope_symbol_id = 4611686018427486207 : i64} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
%output_2 = "oneflow.output"(%1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "cpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_MyGraph_0-output_0", output_lbns = ["_MyGraph_0-output_0/out"], scope_symbol_id = 4611686018427469823 : i64, shape = [1 : si64, 4 : si64]} : (tensor<1x4xf32>) -> tensor<1x4xf32>
oneflow.return %output_2 : tensor<1x4xf32>
}
}
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <random>
#include <gtest/gtest.h>
#include "oneflow/api/cpp/framework/dtype.h"
#include "oneflow/api/cpp/framework/ivalue.h"
#include "oneflow/api/cpp/tests/api_test.h"
namespace oneflow_api {
namespace {
// File-local RNG used to generate random scalar test values.
std::mt19937 rng(std::random_device{}());
}

// Round-trips each scalar type through IValue and reads back the same value.
TEST(Api, ivalue) {
  std::uniform_real_distribution<> dist(-100, 100);
  std::uniform_int_distribution<> dist_bool(0, 1);
  const auto v_int = static_cast<int>(dist(rng));
  ASSERT_EQ(IValue(v_int).ToInt(), v_int);
  const auto v_int64 = static_cast<int64_t>(dist(rng));
  ASSERT_EQ(IValue(v_int64).ToInt(), v_int64);
  const auto v_float = static_cast<float>(dist(rng));
  ASSERT_EQ(IValue(v_float).ToDouble(), v_float);
  const auto v_double = static_cast<double>(dist(rng));
  ASSERT_EQ(IValue(v_double).ToDouble(), v_double);
  const auto v_bool = static_cast<bool>(dist_bool(rng));
  ASSERT_EQ(IValue(v_bool).ToBool(), v_bool);
}
// A Tensor stored in an IValue keeps its shape, device, and dtype.
TEST(Api, ivalue_tensor) {
  EnvScope scope;
  const auto device = Device("cpu");
  const auto shape = RandomShape();
  const auto dtype = DType::kDouble;
  const IValue i_tensor(Tensor(shape, device, dtype));
  const auto& tensor = i_tensor.ToTensor();
  ASSERT_EQ(tensor.shape(), shape);
  ASSERT_EQ(tensor.device(), device);
  ASSERT_EQ(tensor.dtype(), dtype);
}

// A vector of Tensors round-trips through IValue element by element.
TEST(Api, ivalue_tensor_vector) {
  EnvScope scope;
  const auto device = Device("cpu");
  const std::vector<Tensor> v_tensor_vector{Tensor(RandomShape(), device, DType::kDouble),
                                            Tensor(RandomShape(), device, DType::kFloat)};
  const auto i_tensor = IValue(v_tensor_vector);
  const auto& tensor_vector = i_tensor.ToTensorVector();
  ASSERT_EQ(v_tensor_vector.size(), tensor_vector.size());
  for (size_t i = 0; i < tensor_vector.size(); ++i) {
    ASSERT_EQ(v_tensor_vector[i].device(), tensor_vector[i].device());
    ASSERT_EQ(v_tensor_vector[i].shape(), tensor_vector[i].shape());
    ASSERT_EQ(v_tensor_vector[i].dtype(), tensor_vector[i].dtype());
  }
}
// Copy construction and copy assignment of IValue preserve the held tensor.
TEST(Api, ivalue_copy) {
  EnvScope scope;
  const auto device = Device("cpu");
  const auto shape = RandomShape();
  const auto dtype = DType::kDouble;
  const IValue i_tensor(Tensor(shape, device, dtype));
  const auto i_tensor_a = i_tensor;  // NOLINT
  ASSERT_EQ(i_tensor_a.ToTensor().shape(), shape);
  ASSERT_EQ(i_tensor_a.ToTensor().device(), device);
  ASSERT_EQ(i_tensor_a.ToTensor().dtype(), dtype);
  IValue i_tensor_b;
  i_tensor_b = i_tensor;
  ASSERT_EQ(i_tensor_b.ToTensor().shape(), shape);
  ASSERT_EQ(i_tensor_b.ToTensor().device(), device);
  ASSERT_EQ(i_tensor_b.ToTensor().dtype(), dtype);
}

// Move construction/assignment transfer the tensor and leave the source None.
TEST(Api, ivalue_move) {
  EnvScope scope;
  const auto device = Device("cpu");
  const auto shape = RandomShape();
  const auto dtype = DType::kDouble;
  IValue i_tensor_a = IValue(Tensor(shape, device, dtype));
  IValue i_tensor_b = IValue(Tensor(shape, device, dtype));
  IValue i_tensor_c = std::move(i_tensor_a);
  ASSERT_EQ(i_tensor_c.ToTensor().shape(), shape);
  ASSERT_EQ(i_tensor_c.ToTensor().device(), device);
  ASSERT_EQ(i_tensor_c.ToTensor().dtype(), dtype);
  IValue i_tensor_d;
  i_tensor_d = std::move(i_tensor_b);
  ASSERT_EQ(i_tensor_d.ToTensor().shape(), shape);
  ASSERT_EQ(i_tensor_d.ToTensor().device(), device);
  ASSERT_EQ(i_tensor_d.ToTensor().dtype(), dtype);
  // Deliberate use-after-move: the moved-from IValue must report None.
  ASSERT_EQ(i_tensor_a.IsNone(), true);
  ASSERT_EQ(i_tensor_b.IsNone(), true);
}
} // namespace oneflow_api
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <random>
#include <thread>
#include <gtest/gtest.h>
#include "oneflow/api/cpp/tests/api_test.h"
namespace oneflow_api {
namespace {
std::mt19937 rng(std::random_device{}());
// Reference CPU implementation of ReLU used to validate nn::relu:
// returns a copy of `data` with every negative element replaced by zero.
template<typename T>
std::vector<T> Relu(const std::vector<T>& data) {
  const T zero{};
  std::vector<T> clamped;
  clamped.reserve(data.size());
  for (const T& v : data) { clamped.push_back(v < zero ? zero : v); }
  return clamped;
}
} // namespace
// Runs nn::relu on random input and compares against the reference Relu.
void TestRelu() {
  const auto shape = RandomShape();
  const auto data = RandomData<float>(shape.Count(0));
  const auto target_data = Relu(data);
  std::vector<float> result(shape.Count(0));
  auto tensor = Tensor::from_buffer(data.data(), shape, Device("cpu"), DType::kFloat);
  auto result_tensor = nn::relu(tensor);
  result_tensor.copy_to(result.data());
  ASSERT_EQ(result, target_data);
}

TEST(Api, nn_relu) {
  EnvScope scope;
  TestRelu();
}

// Stresses nn::relu from 8-32 concurrent threads sharing one runtime.
TEST(Api, nn_relu_multithreading) {
  EnvScope scope;
  std::vector<std::thread> threads;
  std::uniform_int_distribution<> dist(8, 32);
  int n_threads = dist(rng);
  for (int i = 0; i < n_threads; ++i) { threads.emplace_back(std::thread(TestRelu)); }
  for (auto& x : threads) { x.join(); }
}
} // namespace oneflow_api
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <gtest/gtest.h>
#include "oneflow/api/cpp/tests/api_test.h"
namespace oneflow_api {
// Device construction from type string and from "type:id" notation.
TEST(Api, device) {
  EnvScope scope;
  auto device = Device("cpu");
  ASSERT_EQ(device.type(), "cpu");
#ifdef WITH_CUDA
  device = Device("cuda:0");
  ASSERT_EQ(device.type(), "cuda");
  ASSERT_EQ(device.device_id(), 0);
  device = Device("cuda", 1);
  ASSERT_EQ(device.type(), "cuda");
  ASSERT_EQ(device.device_id(), 1);
#endif
}

// Default-constructed Tensor is an empty-shaped float on CPU; the full
// constructor reports back exactly the shape/device/dtype it was given.
TEST(Api, tensor) {
  EnvScope scope;
  const auto device = Device("cpu");
  const auto shape = RandomShape();
  const auto dtype = DType::kDouble;
  Tensor tensor;
  ASSERT_EQ(tensor.shape(), Shape());
  ASSERT_EQ(tensor.device(), Device("cpu"));
  ASSERT_EQ(tensor.dtype(), DType::kFloat);
  Tensor tensor_with_all(shape, device, dtype);
  ASSERT_EQ(tensor_with_all.shape(), shape);
  ASSERT_EQ(tensor_with_all.device(), device);
  ASSERT_EQ(tensor_with_all.dtype(), dtype);
}
// Round-trips a host buffer through from_buffer/copy_to for every supported
// dtype. The macro stamps out per-dtype locals (data_float, tensor_float, ...)
// so all five expansions can coexist in one scope.
TEST(Api, tensor_from_buffer_and_copy_to) {
  EnvScope scope;
  const auto shape = RandomShape();
#define TEST_TENSOR_FROM_AND_TO_BLOB(dtype, cpp_dtype) \
  std::vector<cpp_dtype> data_##cpp_dtype(shape.Count(0)), new_data_##cpp_dtype(shape.Count(0)); \
  for (int i = 0; i < shape.Count(0); ++i) { data_##cpp_dtype[i] = i; } \
  auto tensor_##cpp_dtype = \
      Tensor::from_buffer(data_##cpp_dtype.data(), shape, Device("cpu"), dtype); \
  tensor_##cpp_dtype.copy_to(new_data_##cpp_dtype.data()); \
  ASSERT_EQ(new_data_##cpp_dtype, data_##cpp_dtype);
  TEST_TENSOR_FROM_AND_TO_BLOB(DType::kFloat, float)
  TEST_TENSOR_FROM_AND_TO_BLOB(DType::kDouble, double)
  TEST_TENSOR_FROM_AND_TO_BLOB(DType::kInt8, int8_t)
  TEST_TENSOR_FROM_AND_TO_BLOB(DType::kInt32, int32_t)
  TEST_TENSOR_FROM_AND_TO_BLOB(DType::kInt64, int64_t)
}

// zeros_() leaves every element equal to zero when read back.
TEST(Api, tensor_zeros) {
  EnvScope scope;
  const auto shape = RandomShape();
  std::vector<float> data(shape.Count(0)), target_data(shape.Count(0));
  Tensor tensor(shape, Device("cpu"), DType::kFloat);
  tensor.zeros_();
  tensor.copy_to(data.data());
  std::fill(target_data.begin(), target_data.end(), 0);
  ASSERT_EQ(data, target_data);
}
} // namespace oneflow_api
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include <memory>
#include <vector>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/framework/tensor.h"
#include "oneflow/core/framework/tensor_tuple.h"
#include "oneflow/core/autograd/autograd_engine.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/container_util.h"
namespace oneflow {
namespace autograd {
namespace {
// A tensor counts as "scalar" for autograd purposes iff it holds exactly one
// element (its shape's element count is 1), regardless of its rank.
bool IsScalarTensor(const one::Tensor& tensor) { return tensor.shape()->elem_cnt() == 1; }
// Checks and sets default value for initial gradients based on out_grads
// If output is the tensor whose size is greater than 1, out_grad's shape must be same as output's.
// If output is a scalar tensor, out_grad will also be a scaler or empty(will be initted to
// `oneflow.ones([1])`).
// Validates `out_grads` against `outputs` and materializes the initial
// gradients that seed the backward pass.
//
// Rules enforced (all failures surface as Maybe errors via CHECK_*_OR_RETURN):
//  - every output must have `requires_grad() == true`;
//  - an output without a grad_fn node must be a leaf (it then gets an
//    implicit accumulate node so its gradient can be stored);
//  - if `out_grads` is empty, every output must be a scalar and its initial
//    gradient defaults to a ones-like tensor;
//  - otherwise each out_grad must match its output's shape, and is cast to
//    the output's dtype when the dtypes differ.
//
// Returns a TensorTuple of initial gradients, one per output.
Maybe<one::TensorTuple> CheckAndInitOutGrads(const one::TensorTuple& outputs,
                                             const one::TensorTuple& out_grads) {
  const size_t grad_size = out_grads.empty() ? outputs.size() : out_grads.size();
  auto gradients = std::make_shared<one::TensorTuple>(grad_size);
  CHECK_EQ_OR_RETURN(outputs.size(), gradients->size())
      << "RuntimeError: got " << outputs.size() << " tensors and " << gradients->size()
      << " gradients";
  for (size_t i = 0; i < outputs.size(); ++i) {
    // Bounds-checked, Maybe-based element access throughout (instead of the
    // exception-throwing `.at()`), consistent with the rest of this function.
    const auto& output = JUST(oneflow::VectorAt(outputs, i));
    CHECK_OR_RETURN(output->requires_grad())
        << "All output tensors `.requires_grad` should be true";
    if (!output->grad_fn_node()) {
      CHECK_OR_RETURN(output->is_leaf())
          << "output[" << i << "] doesn't have grad_fn and it is not leaf tensor!\n"
          << "It is a bug with oneflow, please submit an issue on GitHub: "
             "https://github.com/Oneflow-Inc/oneflow/issues";
      // Leaf tensors get an implicit accumulate node so their grads are saved.
      JUST(one::AddAccumulateFunctionNode(output));
    }
    if (out_grads.empty()) {
      CHECK_OR_RETURN(IsScalarTensor(*output))
          << "Grad can be implicitly created only for scalar outputs";
      JUST(oneflow::VectorAt(*gradients, i)) = JUST(one::functional::OnesLike(output));
    } else {
      const auto& out_grad = JUST(oneflow::VectorAt(out_grads, i));
      CHECK_OR_RETURN(*output->shape() == *out_grad->shape())
          << "out_grad's shape must be same as output's (" << output->shape()->ToString()
          << " vs " << out_grad->shape()->ToString() << ")";
      if (output->dtype() != out_grad->dtype()) {
        JUST(oneflow::VectorAt(*gradients, i)) =
            JUST(one::functional::Cast(out_grad, output->dtype(), /*pin_memory=*/false));
      } else {
        JUST(oneflow::VectorAt(*gradients, i)) = out_grad;
      }
    }
  }
  return gradients;
}
} // namespace
// Runs a full backward pass from `outputs`, seeded by `out_grads` (or implicit
// ones for scalar outputs), and saves gradients on the leaf tensors.
// Returns an empty TensorTuple; the computed gradients are not returned here.
Maybe<one::TensorTuple> Backward(const one::TensorTuple& outputs, const one::TensorTuple& out_grads,
                                 bool retain_graph, bool create_graph) {
  // Building a graph of the backward pass implies the forward graph is kept.
  retain_graph = retain_graph || create_graph;
  const std::shared_ptr<one::TensorTuple> gradients =
      JUST(CheckAndInitOutGrads(outputs, out_grads));
  JUST(one::GetThreadLocalAutogradEngine()->RunBackwardAndSaveGrads4LeafTensorIf(
      outputs, *gradients, retain_graph, create_graph));
  return std::make_shared<one::TensorTuple>(0);
}
// Computes and returns the gradients of `outputs` w.r.t. `inputs` without
// touching other leaf tensors' grads. With no explicit inputs it degenerates
// to a full Backward() call. Every input must have `requires_grad() == true`.
Maybe<one::TensorTuple> Grad(const one::TensorTuple& outputs, const one::TensorTuple& inputs,
                             const one::TensorTuple& out_grads, bool retain_graph,
                             bool create_graph) {
  // Building a graph of the backward pass implies the forward graph is kept.
  retain_graph = retain_graph || create_graph;
  if (inputs.empty()) { return Backward(outputs, out_grads, retain_graph, create_graph); }
  const bool all_require_grad = std::all_of(
      inputs.begin(), inputs.end(),
      [](const std::shared_ptr<one::Tensor>& tensor) { return tensor->requires_grad(); });
  CHECK_OR_RETURN(all_require_grad) << "All input tensors `.requires_grad` should be true";
  const std::shared_ptr<one::TensorTuple> gradients =
      JUST(CheckAndInitOutGrads(outputs, out_grads));
  return one::GetThreadLocalAutogradEngine()->RunBackwardAndReturnInputsTensorGradIf(
      outputs, inputs, *gradients, retain_graph, create_graph);
}
// Registers the C++ autograd entry points under the "autograd" submodule so
// Python-side `backward`/`grad` calls dispatch to the functions above.
ONEFLOW_API_PYBIND11_MODULE("autograd", m) {
  m.def("backward", &Backward);
  m.def("grad", &Grad);
}
} // namespace autograd
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <pybind11/pybind11.h>
#include "oneflow/api/python/of_api_registry.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/framework/global_param_grad_sync_mode.h"
namespace py = pybind11;
namespace oneflow {
// Exposes GlobalParamGradSyncMode to Python at the module root (empty submodule
// path). The class is held by shared_ptr and constructed from a single bool
// flag via a lambda-based py::init.
ONEFLOW_API_PYBIND11_MODULE("", m) {
  py::class_<GlobalParamGradSyncMode, std::shared_ptr<GlobalParamGradSyncMode>>(
      m, "GlobalParamGradSyncMode")
      .def(py::init([](bool flag) { return std::make_shared<GlobalParamGradSyncMode>(flag); }));
}
} // namespace oneflow
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment