Commit 21d47d0e authored by yuguo

Oneflow 0.8 for DCU
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
namespace oneflow {
namespace one {
struct IdentityCaptureState : public AutoGradCaptureState {
bool requires_grad;
};
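// Backward sketch (for orientation only): identity is a pass-through op, so when the input
// requires grad its gradient is simply dx = dy, which is what Apply() below forwards.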
class Identity : public OpExprGradFunction<IdentityCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(IdentityCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1); // NOLINT(maybe-need-error-msg)
ctx->requires_grad = inputs.at(0)->requires_grad();
return Maybe<void>::Ok();
}
Maybe<void> Apply(const IdentityCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
in_grads->resize(1);
if (ctx->requires_grad) { in_grads->at(0) = out_grads.at(0); }
return Maybe<void>::Ok();
}
};
REGISTER_OP_EXPR_GRAD_FUNCTION("identity", Identity);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct KLDivLossCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
bool log_target = false;
};
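// Gradient sketch (assuming the standard KL-divergence loss definition): with
// log_target=false the input gradient is -target * dy, and with log_target=true it is
// -exp(target) * dy; functional::KLDivLossGrad below is expected to compute this. The
// target input receives no gradient here.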
class KLDivLoss : public OpExprGradFunction<KLDivLossCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(KLDivLossCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const KLDivLossCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> KLDivLoss::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> KLDivLoss::Capture(KLDivLossCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->log_target = JUST(composed_attrs.GetAttr<bool>("log_target"));
ctx->SaveTensorForBackward(inputs.at(0)); // input
ctx->SaveTensorForBackward(inputs.at(1)); // target
return Maybe<void>::Ok();
}
Maybe<void> KLDivLoss::Apply(const KLDivLossCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
const auto& dy = out_grads.at(0);
const auto& input = ctx->SavedTensors().at(0);
const auto& target = ctx->SavedTensors().at(1);
in_grads->resize(ctx->SavedTensors().size());
in_grads->at(0) = JUST(functional::KLDivLossGrad(dy, input, target, ctx->log_target));
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("kl_div_loss", KLDivLoss);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct L2NormalizeCaptureState : public AutoGradCaptureState {
int64_t axis;
float epsilon;
bool requires_grad;
};
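// Gradient sketch (assuming the forward computes y = x * rsqrt(max(sum(x^2, axis), epsilon))):
// dx = (dy - y * sum(dy * y, axis)) * rsqrt(max(square_x_sum, epsilon)), which is what
// functional::L2NormalizeGrad is expected to compute from the saved y and square_x_sum.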
class L2Normalize : public OpExprGradFunction<L2NormalizeCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(L2NormalizeCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const L2NormalizeCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> L2Normalize::Init(const OpExpr& op) {
const UserOpExpr* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> L2Normalize::Capture(L2NormalizeCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ctx->SaveTensorForBackward(outputs.at(0)); // y
ctx->SaveTensorForBackward(outputs.at(1)); // square_x_sum
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->axis = JUST(composed_attrs.GetAttr<int32_t>("axis"));
ctx->epsilon = JUST(composed_attrs.GetAttr<float>("epsilon"));
return Maybe<void>::Ok();
}
Maybe<void> L2Normalize::Apply(const L2NormalizeCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
in_grads->resize(1);
CHECK_EQ_OR_RETURN(out_grads.size(), 2); // NOLINT(maybe-need-error-msg)
const auto& y = ctx->SavedTensors().at(0);
const auto& square_x_sum = ctx->SavedTensors().at(1);
in_grads->at(0) =
JUST(functional::L2NormalizeGrad(out_grads.at(0), y, square_x_sum, ctx->axis, ctx->epsilon));
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("l2_normalize", L2Normalize);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct LayerNormCaptureState : public AutoGradCaptureState {
bool center = true;
bool scale = true;
int64_t begin_norm_axis = 1;
int64_t begin_params_axis = 1;
double epsilon = 1e-5;
bool x_requires_grad = true;
bool has_affine = true;
size_t gamma_index = 0;
size_t x_index = 1;
size_t mean_index = 2;
size_t inv_variance_index = 3;
};
// y, mean, inv_variance =
// layer_norm(x, [gamma], [beta], center=False, scale=False, begin_norm_axis=1,
// begin_params_axis=-1, epsilon=1e-5)
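// Backward sketch (a reading of Apply() below, not a new contract): gamma/beta gradients come
// from LayerNormParamGrad, which reduces dy (and dy times the normalized x) over the leading
// [0, begin_params_axis) dimensions; the x gradient comes from LayerNormGrad, or from
// LayerNormAffineGrad with the saved gamma folded in when scale is enabled.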
class LayerNorm : public OpExprGradFunction<LayerNormCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(LayerNormCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const LayerNormCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
std::string op_name_;
};
Maybe<void> LayerNorm::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
op_name_ = fw_op_expr->op_name();
return Maybe<void>::Ok();
}
Maybe<void> LayerNorm::Capture(LayerNormCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->center = JUST(composed_attrs.GetAttr<bool>("center"));
ctx->scale = JUST(composed_attrs.GetAttr<bool>("scale"));
ctx->begin_norm_axis = JUST(composed_attrs.GetAttr<int64_t>("begin_norm_axis"));
ctx->begin_params_axis = JUST(composed_attrs.GetAttr<int64_t>("begin_params_axis"));
ctx->epsilon = JUST(composed_attrs.GetAttr<double>("epsilon"));
CHECK_EQ_OR_RETURN(inputs.size(), ctx->center + ctx->scale + 1); // NOLINT(maybe-need-error-msg)
CHECK_EQ_OR_RETURN(outputs.size(), 3); // NOLINT(maybe-need-error-msg)
bool has_gamma_diff = ctx->scale && inputs.at(1)->requires_grad();
bool has_beta_diff = ctx->center && inputs.at(2)->requires_grad();
ctx->has_affine = has_gamma_diff && has_beta_diff;
ctx->x_requires_grad = inputs.at(0)->requires_grad();
if (ctx->x_requires_grad || ctx->has_affine) {
ctx->x_index = ctx->SaveTensorForBackward(inputs.at(0));
ctx->mean_index = ctx->SaveTensorForBackward(outputs.at(1));
ctx->inv_variance_index = ctx->SaveTensorForBackward(outputs.at(2));
if (ctx->x_requires_grad && ctx->scale) {
ctx->gamma_index = ctx->SaveTensorForBackward(inputs.at(1)); // save gamma.
}
}
return Maybe<void>::Ok();
}
Maybe<void> LayerNorm::Apply(const LayerNormCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
const auto& saved_tensors = ctx->SavedTensors();
in_grads->resize(ctx->center + ctx->scale + 1);
std::shared_ptr<Tensor> dy = out_grads.at(0);
int64_t begin_params_axis = ctx->begin_params_axis;
if (begin_params_axis < 0) { begin_params_axis += dy->shape()->NumAxes(); }
int64_t begin_norm_axis = ctx->begin_norm_axis;
if (begin_norm_axis < 0) { begin_norm_axis += dy->shape()->NumAxes(); }
std::shared_ptr<Tensor> x = saved_tensors.at(ctx->x_index);
std::shared_ptr<Tensor> mean = saved_tensors.at(ctx->mean_index);
std::shared_ptr<Tensor> inv_variance = saved_tensors.at(ctx->inv_variance_index);
if (ctx->has_affine) {
// Use LayerNormParamGrad(Tensor dy, Tensor x, Tensor mean, Tensor inv_variance, Int64
// begin_params_axis, Double epsilon).
const auto& results = JUST(
functional::LayerNormParamGrad(dy, x, mean, inv_variance, begin_params_axis, ctx->epsilon));
in_grads->at(1) = results->at(0); // For gamma.
in_grads->at(2) = results->at(1); // For beta.
}
if (ctx->x_requires_grad) {
if (ctx->scale) {
std::shared_ptr<Tensor> gamma = saved_tensors.at(ctx->gamma_index);
in_grads->at(0) = JUST(functional::LayerNormAffineGrad(dy, x, mean, inv_variance, gamma,
begin_norm_axis, ctx->epsilon));
} else {
in_grads->at(0) =
JUST(functional::LayerNormGrad(dy, x, mean, inv_variance, begin_norm_axis, ctx->epsilon));
}
}
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("layer_norm", LayerNorm);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/framework/op_expr.h"
namespace oneflow {
namespace one {
struct LogSoftmaxCaptureState : public AutoGradCaptureState {
bool requires_grad;
};
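// Gradient sketch: with prob = log_softmax(x) saved from the forward pass, the backward is
// dx = dy - exp(prob) * sum(dy) along the softmax dimension, which the "log_softmax_grad" op
// dispatched below is expected to implement.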
class LogSoftmax : public OpExprGradFunction<LogSoftmaxCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(LogSoftmaxCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const LogSoftmaxCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
std::shared_ptr<OpExpr> grad_op_;
};
Maybe<void> LogSoftmax::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
const std::string& op_name = fw_op_expr->op_name();
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
grad_op_ = JUST(one::OpBuilder("log_softmax_grad", GradientOpName(op_name))
.Input("prob")
.Input("dy")
.Output("dx")
.Build());
return Maybe<void>::Ok();
}
Maybe<void> LogSoftmax::Capture(LogSoftmaxCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ComposedAttrMap composed_attrs(attrs, base_attrs_);
CHECK_EQ_OR_RETURN(inputs.size(), 1); // NOLINT(maybe-need-error-msg)
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) return Maybe<void>::Ok();
ctx->SaveTensorForBackward(outputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> LogSoftmax::Apply(const LogSoftmaxCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad) return Maybe<void>::Ok();
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
const auto& dy = out_grads.at(0);
const auto& prob = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(OpInterpUtil::Dispatch<Tensor>(*grad_op_, {prob, dy}));
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("log_softmax", LogSoftmax);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct MaskedFillCaptureState : public AutoGradCaptureState {
bool requires_grad = true;
};
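// Gradient sketch: masked_fill writes a constant into the masked positions, so the input
// gradient zeroes exactly those positions, i.e. dx = where(mask, 0, dy); the mask itself
// gets no gradient (in_grads[1] is intentionally left empty below).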
class MaskedFill : public OpExprGradFunction<MaskedFillCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(MaskedFillCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ctx->SaveTensorForBackward(inputs.at(0));
ctx->SaveTensorForBackward(inputs.at(1));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const MaskedFillCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
const std::shared_ptr<oneflow::one::Tensor>& mask = ctx->SavedTensors().at(1);
std::shared_ptr<oneflow::one::Tensor> zero_out = JUST(functional::ZerosLike(x));
in_grads->resize(2);
in_grads->at(0) = JUST(functional::Where(mask, zero_out, out_grads.at(0)));
return Maybe<void>::Ok();
}
};
REGISTER_OP_EXPR_GRAD_FUNCTION("masked_fill", MaskedFill);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/user/ops/math_binary_elementwise_seq.h"
namespace oneflow {
namespace one {
struct BinaryMathCaptureState : public AutoGradCaptureState {
bool x_requires_grad;
bool y_requires_grad;
};
typedef Maybe<one::Tensor> (*BinaryBwFunc)(const std::shared_ptr<one::Tensor>&,
const std::shared_ptr<one::Tensor>&,
const std::shared_ptr<one::Tensor>&);
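// Each registered binary elementwise op supplies two backward functionals with this
// signature, one per input: BwXFunc(x, y, dz) -> dx and BwYFunc(x, y, dz) -> dy.
// For example (assuming "pow" appears in MATH_BINARY_ELEMENTWISE_FUNC_SEQ), PowXGrad would
// compute y * pow(x, y - 1) * dz and PowYGrad would compute pow(x, y) * log(x) * dz.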
template<BinaryBwFunc BwXFunc, BinaryBwFunc BwYFunc>
class BinaryMathOp : public OpExprGradFunction<BinaryMathCaptureState> {
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(BinaryMathCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->x_requires_grad = inputs.at(0)->requires_grad();
ctx->y_requires_grad = inputs.at(1)->requires_grad();
ctx->SaveTensorForBackward(inputs.at(0));
ctx->SaveTensorForBackward(inputs.at(1));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const BinaryMathCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!(ctx->x_requires_grad || ctx->y_requires_grad)) { return Maybe<void>::Ok(); }
in_grads->resize(2);
const std::shared_ptr<one::Tensor>& x = ctx->SavedTensors().at(0);
const std::shared_ptr<one::Tensor>& y = ctx->SavedTensors().at(1);
if (ctx->x_requires_grad) { in_grads->at(0) = JUST(BwXFunc(x, y, out_grads.at(0))); }
if (ctx->y_requires_grad) { in_grads->at(1) = JUST(BwYFunc(x, y, out_grads.at(0))); }
return Maybe<void>::Ok();
}
};
#define INSTANTIAT_AND_REGISTER_BINARY_MATHOP_CLASS(op_type_name, op_cls) \
class op_cls##Cls final \
: public BinaryMathOp<functional::op_cls##XGrad, functional::op_cls##YGrad> {}; \
REGISTER_OP_EXPR_GRAD_FUNCTION(op_type_name, op_cls##Cls);
OF_PP_FOR_EACH_TUPLE(INSTANTIAT_AND_REGISTER_BINARY_MATHOP_CLASS, MATH_BINARY_ELEMENTWISE_FUNC_SEQ);
#undef INSTANTIAT_AND_REGISTER_BINARY_MATHOP_CLASS
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/user/ops/math_unary_elementwise_seq.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct UnaryMathCaptureState : public AutoGradCaptureState {
bool x_requires_grad;
};
typedef Maybe<one::Tensor> (*UnaryBwFunc)(const std::shared_ptr<one::Tensor>&,
const std::shared_ptr<one::Tensor>&);
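// Each registered unary elementwise op supplies one backward functional with this signature:
// BwFunc(x, dy) -> dx. For example (assuming "sin" appears in MATH_UNARY_ELEMENTWISE_FUNC_SEQ),
// SinGrad would compute cos(x) * dy.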
template<UnaryBwFunc BwFunc>
class UnaryMathOp : public OpExprGradFunction<UnaryMathCaptureState> {
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UnaryMathCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->x_requires_grad = inputs.at(0)->requires_grad();
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UnaryMathCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->x_requires_grad) { return Maybe<void>::Ok(); }
const auto& x = ctx->SavedTensors().at(0);
in_grads->resize(1);  // ensure in_grads has a slot for the single input before indexing
in_grads->at(0) = JUST(BwFunc(x, out_grads.at(0)));
return Maybe<void>::Ok();
}
protected:
std::shared_ptr<OpExpr> grad_op_;
};
#define INSTANTIAT_AND_REGISTER_UNARY_MATHOP_CLASS(op_type_name, op_cls) \
class op_cls##Cls final : public UnaryMathOp<functional::op_cls##Grad> {}; \
REGISTER_OP_EXPR_GRAD_FUNCTION(op_type_name, op_cls##Cls);
OF_PP_FOR_EACH_TUPLE(INSTANTIAT_AND_REGISTER_UNARY_MATHOP_CLASS, MATH_UNARY_ELEMENTWISE_FUNC_SEQ);
OF_PP_FOR_EACH_TUPLE(INSTANTIAT_AND_REGISTER_UNARY_MATHOP_CLASS,
OF_PP_MAKE_TUPLE_SEQ("tanh", Tanh));
// higher order derivative
OF_PP_FOR_EACH_TUPLE(INSTANTIAT_AND_REGISTER_UNARY_MATHOP_CLASS,
OF_PP_MAKE_TUPLE_SEQ("sin_grad", SinGrad));
OF_PP_FOR_EACH_TUPLE(INSTANTIAT_AND_REGISTER_UNARY_MATHOP_CLASS,
OF_PP_MAKE_TUPLE_SEQ("cos_grad", CosGrad));
#undef INSTANTIAT_AND_REGISTER_UNARY_MATHOP_CLASS
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct MatmulCaptureState : public AutoGradCaptureState {
bool transpose_a;
bool transpose_b;
double alpha;
bool requires_grad_a;
bool requires_grad_b;
size_t a_index;
size_t b_index;
};
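// Backward sketch (standard matmul gradients for out = alpha * op_a(a) * op_b(b)): with no
// transposes, da = alpha * dout * b^T and db = alpha * a^T * dout. The branches in Apply()
// below fold transpose_a/transpose_b into the transpose flags of functional::MatMul, and
// Capture() saves only the operand actually needed for each requested gradient.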
class Matmul : public OpExprGradFunction<MatmulCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(MatmulCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const MatmulCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
protected:
AttrMap base_attrs_;
};
Maybe<void> Matmul::Init(const OpExpr& op) {
const UserOpExpr* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> Matmul::Capture(MatmulCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ctx->requires_grad_a = inputs.at(0)->requires_grad();
ctx->requires_grad_b = inputs.at(1)->requires_grad();
if (!ctx->requires_grad_a && !ctx->requires_grad_b) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->transpose_a = JUST(composed_attrs.GetAttr<bool>("transpose_a"));
ctx->transpose_b = JUST(composed_attrs.GetAttr<bool>("transpose_b"));
ctx->alpha = JUST(composed_attrs.GetAttr<double>("alpha"));
if (ctx->requires_grad_a) {
ctx->b_index = ctx->SaveTensorForBackward(inputs.at(1)); // input b
}
if (ctx->requires_grad_b) {
ctx->a_index = ctx->SaveTensorForBackward(inputs.at(0)); // input a
}
return Maybe<void>::Ok();
}
Maybe<void> Matmul::Apply(const MatmulCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad_a && !ctx->requires_grad_b) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
in_grads->resize(2);
if (ctx->requires_grad_a) {
const auto& input_b = ctx->SavedTensors().at(ctx->b_index);
if (ctx->transpose_a) {
in_grads->at(0) =
JUST(functional::MatMul(input_b, out_grads.at(0), ctx->transpose_b, true, ctx->alpha));
} else {
in_grads->at(0) = JUST(
functional::MatMul(out_grads.at(0), input_b, false, !(ctx->transpose_b), ctx->alpha));
}
}
if (ctx->requires_grad_b) {
const auto& input_a = ctx->SavedTensors().at(ctx->a_index);
if (ctx->transpose_b) {
in_grads->at(1) =
JUST(functional::MatMul(out_grads.at(0), input_a, true, ctx->transpose_a, ctx->alpha));
} else {
in_grads->at(1) = JUST(
functional::MatMul(input_a, out_grads.at(0), !(ctx->transpose_a), false, ctx->alpha));
}
}
return Maybe<void>::Ok();
}
class BroadcastMatmul : public Matmul {
public:
Maybe<void> Apply(const MatmulCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
};
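// broadcast_matmul reuses the Matmul gradient for a; the gradient for b additionally has to
// reduce over the broadcast batch dimensions, which functional::BroadcastMatmulGradB is
// expected to handle in the override below.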
Maybe<void> BroadcastMatmul::Apply(const MatmulCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad_a && !ctx->requires_grad_b) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
in_grads->resize(2);
if (ctx->requires_grad_a) {
const auto& input_b = ctx->SavedTensors().at(ctx->b_index);
if (ctx->transpose_a) {
in_grads->at(0) =
JUST(functional::MatMul(input_b, out_grads.at(0), ctx->transpose_b, true, ctx->alpha));
} else {
in_grads->at(0) = JUST(
functional::MatMul(out_grads.at(0), input_b, false, !(ctx->transpose_b), ctx->alpha));
}
}
if (ctx->requires_grad_b) {
const auto& input_a = ctx->SavedTensors().at(ctx->a_index);
if (ctx->transpose_b) {
in_grads->at(1) =
JUST(functional::BroadcastMatmulGradB(out_grads.at(0), input_a, ctx->alpha));
} else {
in_grads->at(1) =
JUST(functional::BroadcastMatmulGradB(input_a, out_grads.at(0), ctx->alpha));
}
}
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("matmul", Matmul);
REGISTER_OP_EXPR_GRAD_FUNCTION("batch_matmul", Matmul);
REGISTER_OP_EXPR_GRAD_FUNCTION("broadcast_matmul", BroadcastMatmul);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
namespace {
struct MaxPoolCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
size_t input_index = 0;
size_t indice_index = 0;
std::string data_format;
std::vector<int32_t> padding;
std::vector<int32_t> kernel_size;
std::vector<int32_t> stride;
std::vector<int32_t> dilation;
bool return_indices = false;
bool ceil_mode = false;
};
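// Backward sketch: the forward max pooling records, per output element, the index of the
// selected input element (the saved "indice" tensor); the backward scatters dy back to those
// positions and leaves every other input position with zero gradient, which
// functional::MaxPoolNdGrad is expected to implement.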
class MaxPoolNdGrad : public OpExprGradFunction<MaxPoolCaptureState> {
public:
virtual ~MaxPoolNdGrad() = default;
using OpExprGradFunction<MaxPoolCaptureState>::Init;
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(MaxPoolCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const MaxPoolCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> MaxPoolNdGrad::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> MaxPoolNdGrad::Capture(MaxPoolCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ctx->input_index = ctx->SaveTensorForBackward(inputs.at(0));
ctx->indice_index = ctx->SaveTensorForBackward(outputs.at(1));
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->padding = JUST(composed_attrs.GetAttr<std::vector<int32_t>>("padding"));
ctx->kernel_size = JUST(composed_attrs.GetAttr<std::vector<int32_t>>("kernel_size"));
ctx->stride = JUST(composed_attrs.GetAttr<std::vector<int32_t>>("stride"));
ctx->dilation = JUST(composed_attrs.GetAttr<std::vector<int32_t>>("dilation"));
ctx->return_indices = JUST(composed_attrs.GetAttr<bool>("return_indices"));
ctx->ceil_mode = JUST(composed_attrs.GetAttr<bool>("ceil_mode"));
return Maybe<void>::Ok();
}
Maybe<void> MaxPoolNdGrad::Apply(const MaxPoolCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_LE_OR_RETURN(out_grads.size(), 2); // NOLINT(maybe-need-error-msg)
int32_t ndims = ctx->kernel_size.size();
const auto& input = ctx->SavedTensors().at(ctx->input_index);
const auto& indice = ctx->SavedTensors().at(ctx->indice_index);
in_grads->resize(1);
(*in_grads)[0] = JUST(functional::MaxPoolNdGrad(
input, indice, out_grads[0], ndims, ctx->data_format, ctx->padding, ctx->kernel_size,
ctx->stride, ctx->dilation, ctx->return_indices, ctx->ceil_mode));
return Maybe<void>::Ok();
}
} // namespace
REGISTER_OP_EXPR_GRAD_FUNCTION("max_pool_1d", MaxPoolNdGrad);
REGISTER_OP_EXPR_GRAD_FUNCTION("max_pool_2d", MaxPoolNdGrad);
REGISTER_OP_EXPR_GRAD_FUNCTION("max_pool_3d", MaxPoolNdGrad);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/functional/sequence_function.h"
#include "oneflow/core/common/container_util.h"
namespace oneflow {
namespace one {
struct MedianCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
};
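// Backward sketch for the global median (a reading of the Apply() chain below): broadcast the
// median back to the input shape, build a 0/1 mask of entries equal to it (cast_like), divide
// dy by the number of such entries, and broadcast the quotient back, so the gradient is shared
// evenly among all elements attaining the median value.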
class Median : public OpExprGradFunction<MedianCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(MedianCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->requires_grad = JUST(VectorAt(inputs, 0))->requires_grad();
if (ctx->requires_grad) {
ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 0)));
ctx->SaveTensorForBackward(JUST(VectorAt(outputs, 0)));
}
return Maybe<void>::Ok();
}
Maybe<void> Apply(const MedianCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (ctx->requires_grad) {
const auto& input = JUST(VectorAt(ctx->SavedTensors(), 0));
const auto& output = JUST(VectorAt(ctx->SavedTensors(), 1));
const auto& dy = JUST(VectorAt(out_grads, 0));
std::vector<int32_t> axis(input->ndim());
std::iota(axis.begin(), axis.end(), 0);
const auto cast_like =
JUST(functional::SequenceFunction<Maybe<Tensor>()>(
[&]() { return functional::BroadcastLike(output, input, axis); })
.then(std::bind(functional::BroadcastEqual, input, std::placeholders::_1))
.then(std::bind(functional::CastLike, std::placeholders::_1, input))
.call());
const auto bcast_like_div =
JUST(functional::SequenceFunction<Maybe<Tensor>()>(
[&]() { return functional::ReduceSum(cast_like, axis, false); })
.then(std::bind(functional::Div, dy, std::placeholders::_1))
.then(std::bind(functional::BroadcastLike, std::placeholders::_1, input, axis))
.call());
in_grads->resize(1);
JUST(VectorAt(*in_grads, 0)) = JUST(functional::Mul(bcast_like_div, cast_like));
}
return Maybe<void>::Ok();
}
};
struct MedianWithIndicesCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
};
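// Backward sketch: median_with_indices also returns the index of the chosen element along the
// last dimension, so the backward scatters dy into a zero tensor at those indices via
// functional::DimScatter.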
class MedianWithIndices : public OpExprGradFunction<MedianWithIndicesCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(MedianWithIndicesCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->requires_grad = JUST(VectorAt(inputs, 0))->requires_grad();
if (ctx->requires_grad) {
ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 0)));
ctx->SaveTensorForBackward(JUST(VectorAt(outputs, 1)));
}
return Maybe<void>::Ok();
}
Maybe<void> Apply(const MedianWithIndicesCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (ctx->requires_grad) {
in_grads->resize(1);
const auto& input = JUST(VectorAt(ctx->SavedTensors(), 0));
const auto& indices = JUST(functional::Unsqueeze(JUST(VectorAt(ctx->SavedTensors(), 1)), -1));
const auto& dout = JUST(functional::Unsqueeze(JUST(VectorAt(out_grads, 0)), -1));
JUST(VectorAt(*in_grads, 0)) = JUST(
functional::DimScatter(JUST(functional::Constant(*(input->shape()), Scalar(0),
*dout->dtype(), JUST(dout->device()))),
-1, indices, dout));
}
return Maybe<void>::Ok();
}
};
REGISTER_OP_EXPR_GRAD_FUNCTION("median", Median);
REGISTER_OP_EXPR_GRAD_FUNCTION("median_with_indices", MedianWithIndices);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/job/lazy_mode.h"
#include "oneflow/core/framework/nd_sbp.h"
namespace oneflow {
namespace one {
struct NarrowCaptureState : public AutoGradCaptureState {
bool requires_grad;
Shape shape;
int64_t dim;
int64_t start;
int64_t length;
};
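// Backward sketch: narrow slices `length` elements starting at `start` along `dim`, so the
// backward writes dy back into a tensor shaped like the original input, zero outside the
// slice; functional::NarrowGrad only needs the "like" tensor for its shape/meta, which is why
// eager mode can pass an uninitialized Empty tensor while lazy mode saves the input itself.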
class Narrow : public OpExprGradFunction<NarrowCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> Capture(NarrowCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1); // NOLINT(maybe-need-error-msg)
CHECK_EQ_OR_RETURN(outputs.size(), 1); // NOLINT(maybe-need-error-msg)
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->dim = JUST(composed_attrs.GetAttr<int64_t>("dim"));
ctx->start = JUST(composed_attrs.GetAttr<int64_t>("start"));
ctx->length = JUST(composed_attrs.GetAttr<int64_t>("length"));
if (LazyMode::is_enabled()) {
ctx->SaveTensorForBackward(inputs.at(0));
} else {
ctx->shape = *(inputs.at(0)->shape());
}
return Maybe<void>::Ok();
}
Maybe<void> Apply(const NarrowCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
const auto& dy = out_grads.at(0);
if (ctx->requires_grad) {
std::shared_ptr<Tensor> like;
if (LazyMode::is_enabled()) {
like = ctx->SavedTensors().at(0);
} else if (dy->is_local()) {
like = JUST(
functional::Empty(ctx->shape, dy->dtype(), JUST(dy->device()), /*pin_memory=*/false));
} else {
like = JUST(
functional::ConsistentEmpty(ctx->shape, dy->dtype(), JUST(dy->parallel_desc()),
*JUST(private_details::RawGetSbpList(JUST(dy->nd_sbp())))));
}
in_grads->resize(1);
in_grads->at(0) = JUST(functional::NarrowGrad(dy, like, ctx->dim, ctx->start, ctx->length));
}
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("narrow", Narrow);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/common/container_util.h"
namespace oneflow {
namespace one {
struct NLLCaptureState : public AutoGradCaptureState {
bool requires_grad = false;
int64_t ignore_index = -100;
};
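// Backward sketch (assuming the usual negative log-likelihood definition): per sample, the
// input gradient is -dy at the target class (scaled by the class weight when given) and zero
// elsewhere, with samples equal to ignore_index contributing nothing; functional::NLLGrad is
// expected to implement this. Only the input receives a gradient.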
class NLLGradFunction : public OpExprGradFunction<NLLCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(NLLCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs,
const AttrMap& attrs) const override;
Maybe<void> Apply(const NLLCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> NLLGradFunction::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> NLLGradFunction::Capture(NLLCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
auto input = JUST(VectorAt(inputs, 0));
ctx->requires_grad = input->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->ignore_index = JUST(composed_attrs.GetAttr<int64_t>("ignore_index"));
ctx->SaveTensorForBackward(input); // input
ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 1))); // target
if (inputs.size() == 3) {
ctx->SaveTensorForBackward(inputs[2]); // weight
}
return Maybe<void>::Ok();
}
Maybe<void> NLLGradFunction::Apply(const NLLCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 2); // NOLINT(maybe-need-error-msg)
CHECK_GE_OR_RETURN(ctx->SavedTensors().size(), 2)
<< Error::RuntimeError()
<< "The number of saved tensors is expected to be greater than or equal to 2, but got "
<< ctx->SavedTensors().size();
const auto& out_grad = out_grads[0];
const auto& input = ctx->SavedTensors()[0];
const auto& target = ctx->SavedTensors()[1];
in_grads->resize(ctx->SavedTensors().size());
if (ctx->SavedTensors().size() == 2) {
JUST(VectorAt(*in_grads, 0)) =
JUST(functional::NLLGrad(out_grad, input, target, NullOpt, ctx->ignore_index));
} else {
// has weight
auto weight = JUST(VectorAt(ctx->SavedTensors(), 2));
JUST(VectorAt(*in_grads, 0)) =
JUST(functional::NLLGrad(out_grad, input, target, weight, ctx->ignore_index));
}
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("nll", NLLGradFunction);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct NormalizationGradCaptureState : public AutoGradCaptureState {
int32_t axis;
float epsilon;
bool track_running_stats;
bool is_training;
bool x_requires_grad;
bool gamma_requires_grad;
bool beta_requires_grad;
};
// training:
// y, mean, inv_variance = normalization(x, moving_mean, moving_variance, gamma, beta,
// axis=1, epsilon=0.01, momentum=0.9)
// y, mean, inv_variance = normalization(x, gamma, beta, axis=1, epsilon=0.01, momentum=0.9)
// inference:
// y = normalization(x, moving_mean, moving_variance, gamma, beta, axis=1, epsilon=0.01,
// momentum=0.9)
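// Backward sketch for the eval-mode x gradient computed inline below (training mode takes
// dx directly from functional::NormalizationGrad): dx = dy * gamma * inv_variance with
// inv_variance = rsqrt(moving_variance + epsilon), where gamma and inv_variance are reshaped
// to broadcast along `axis`; fp16 gradients are temporarily cast to fp32 for this computation.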
class NormalizationGrad : public OpExprGradFunction<NormalizationGradCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> Capture(NormalizationGradCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
// input_size may be 3 or 5, as inputs may be
// (x, gamma, beta) or (x, moving_mean, moving_variance, gamma, beta),
// depending on whether track_running_stats is false or true.
// output_size may be 1 or 3, as outputs may be
// (y,) or (y, mean, inv_variance),
// depending on whether is_training is false or true.
ctx->x_requires_grad = inputs.at(0)->requires_grad();
std::shared_ptr<Tensor> gamma, beta;
if (inputs.size() == 3) {
gamma = inputs.at(1);
beta = inputs.at(2);
ctx->track_running_stats = false;
} else {
CHECK_EQ_OR_RETURN(inputs.size(), 5); // NOLINT(maybe-need-error-msg)
gamma = inputs.at(3);
beta = inputs.at(4);
ctx->track_running_stats = true;
}
ctx->gamma_requires_grad = gamma->requires_grad();
ctx->beta_requires_grad = beta->requires_grad();
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->axis = JUST(composed_attrs.GetAttr<int32_t>("axis"));
ctx->epsilon = JUST(composed_attrs.GetAttr<float>("epsilon"));
ctx->is_training = JUST(composed_attrs.GetAttr<bool>("training"));
ctx->SaveTensorForBackward(inputs.at(0)); // x
ctx->SaveTensorForBackward(gamma); // gamma
if (ctx->is_training || !ctx->track_running_stats) {
ctx->SaveTensorForBackward(outputs.at(1)); // mean
ctx->SaveTensorForBackward(outputs.at(2)); // inv_variance
} else {
ctx->SaveTensorForBackward(inputs.at(1)); // moving_mean
ctx->SaveTensorForBackward(inputs.at(2)); // moving_variance
}
return Maybe<void>::Ok();
}
Maybe<void> Apply(const NormalizationGradCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
const auto& x = ctx->SavedTensors().at(0); // x
const auto& gamma = ctx->SavedTensors().at(1); // gamma
const auto& y_grad = out_grads.at(0);
std::shared_ptr<Tensor> mean, inv_variance;
if (ctx->is_training || !ctx->track_running_stats) {
mean = ctx->SavedTensors().at(2); // mean
inv_variance = ctx->SavedTensors().at(3); // inv_variance
} else {
const auto& moving_mean = ctx->SavedTensors().at(2); // moving_mean
const auto& moving_variance = ctx->SavedTensors().at(3); // moving_variance
const auto& add_eps = JUST(
functional::ScalarAdd(moving_variance, ctx->epsilon, /*alpha=*/1, /*inplace=*/false));
mean = moving_mean;
inv_variance = JUST(functional::Rsqrt(add_eps));
}
const auto& results = JUST(functional::NormalizationGrad(y_grad, x, mean, inv_variance, gamma,
ctx->epsilon, ctx->axis));
CHECK_EQ_OR_RETURN(results->size(), 3)
<< Error::RuntimeError() << "The number of results is expected to be 3, but got "
<< results->size();
if (ctx->track_running_stats) {
// The normalization op has 5 inputs which are x, moving_mean, moving_variance, gamma and
// beta.
in_grads->resize(5);
if (ctx->gamma_requires_grad) {
in_grads->at(3) = results->at(1); // gamma_diff;
}
if (ctx->beta_requires_grad) {
in_grads->at(4) = results->at(2); // beta_diff
}
} else {
// The normalization op has 3 inputs which are x, gamma and beta.
in_grads->resize(3);
if (ctx->gamma_requires_grad) {
in_grads->at(1) = results->at(1); // gamma_diff;
}
if (ctx->beta_requires_grad) {
in_grads->at(2) = results->at(2); // beta_diff
}
}
if (!ctx->x_requires_grad) { return Maybe<void>::Ok(); }
if (ctx->is_training) {
in_grads->at(0) = results->at(0);
return Maybe<void>::Ok();
}
Shape shape;
for (int i = 0; i < x->shape()->NumAxes(); ++i) {
if (i != ctx->axis) {
shape.emplace_back(1);
} else {
shape.emplace_back(x->shape()->At(ctx->axis));
}
}
const auto& reshaped_gamma = JUST(functional::Reshape(gamma, shape));
const auto& reshaped_inv_variance = JUST(functional::Reshape(inv_variance, shape));
std::shared_ptr<Tensor> y_grad_fp32 = y_grad;
bool is_fp16 = y_grad->dtype()->data_type() == DataType::kFloat16;
if (is_fp16) {
y_grad_fp32 = JUST(functional::Cast(y_grad, DType::Float(), /*pin_memory=*/false));
}
const auto& dy_mul_gamma = JUST(functional::Mul(reshaped_gamma, y_grad_fp32));
const auto& dy_mul_inv_var = JUST(functional::Mul(dy_mul_gamma, reshaped_inv_variance));
if (is_fp16) {
(*in_grads)[0] =
JUST(functional::Cast(dy_mul_inv_var, DType::Float16(), /*pin_memory=*/false));
} else {
(*in_grads)[0] = dy_mul_inv_var;
}
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("normalization", NormalizationGrad);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/dtype.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct NormalizationAddReluGradCaptureState : public AutoGradCaptureState {
int32_t axis = 1;
float epsilon = 1e-5;
bool track_running_stats = true;
bool is_training = true;
bool has_addend = false;
bool x_requires_grad = true;
bool addend_requires_grad = true;
bool gamma_requires_grad = true;
bool beta_requires_grad = true;
};
// training:
// y, mean, inv_variance = normalization_add_relu(x, Optional(add_end), moving_mean,
//     moving_variance, gamma, beta, axis=1, epsilon=0.01, momentum=0.9)
// y, mean, inv_variance = normalization_add_relu(x, Optional(add_end), gamma, beta,
//     axis=1, epsilon=0.01, momentum=0.9)
// inference:
// y = normalization_add_relu(x, Optional(add_end), moving_mean, moving_variance, gamma, beta,
//     axis=1, epsilon=0.01, momentum=0.9)
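// Backward sketch: on top of the usual batch-norm saves, y and reserve_space are kept so the
// fused backward can undo the ReLU (and split off the optional add_end) before computing the
// normalization gradients; functional::NormalizationAddReluGrad is expected to return
// (dx, gamma_diff, beta_diff[, addend_diff]).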
class NormalizationAddReluGrad : public OpExprGradFunction<NormalizationAddReluGradCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> Capture(NormalizationAddReluGradCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
// input_size may be 3/4/5/6, as inputs may be
// (x, gamma, beta) or (x, moving_mean, moving_variance, gamma, beta), or
// (x, addend, gamma, beta) or (x, addend, moving_mean, moving_variance, gamma, beta),
// depending on whether track_running_stats is false or true.
// output_size may be 2 or 4, as outputs may be
// (y, reserve_space) or (y, reserve_space, mean, inv_variance),
// depending on whether is_training is false or true.
ctx->x_requires_grad = inputs.at(0)->requires_grad();
std::shared_ptr<Tensor> add_end, gamma, beta;
if (inputs.size() == 3 || inputs.size() == 5) {
add_end = nullptr;
if (inputs.size() == 3) {
gamma = inputs.at(1);
beta = inputs.at(2);
ctx->track_running_stats = false;
} else {
gamma = inputs.at(3);
beta = inputs.at(4);
ctx->track_running_stats = true;
}
ctx->has_addend = false;
} else if (inputs.size() == 4 || inputs.size() == 6) {
add_end = inputs.at(1);
if (inputs.size() == 4) {
gamma = inputs.at(2);
beta = inputs.at(3);
ctx->track_running_stats = false;
} else {
gamma = inputs.at(4);
beta = inputs.at(5);
ctx->track_running_stats = true;
}
ctx->has_addend = true;
ctx->addend_requires_grad = inputs.at(1)->requires_grad();
}
ctx->gamma_requires_grad = gamma->requires_grad();
ctx->beta_requires_grad = beta->requires_grad();
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->axis = JUST(composed_attrs.GetAttr<int32_t>("axis"));
ctx->epsilon = JUST(composed_attrs.GetAttr<float>("epsilon"));
ctx->is_training = JUST(composed_attrs.GetAttr<bool>("training"));
ctx->SaveTensorForBackward(inputs.at(0)); // x 0
ctx->SaveTensorForBackward(gamma); // gamma 1
ctx->SaveTensorForBackward(beta); // beta 2
if (ctx->is_training || !ctx->track_running_stats) {
ctx->SaveTensorForBackward(outputs.at(2)); // mean 3
ctx->SaveTensorForBackward(outputs.at(3)); // inv_variance 4
} else {
if (inputs.size() == 5) {
// without add_end
ctx->SaveTensorForBackward(inputs.at(1)); // moving_mean 3
ctx->SaveTensorForBackward(inputs.at(2)); // moving_variance 4
} else {
CHECK_EQ_OR_RETURN(inputs.size(), 6); // NOLINT(maybe-need-error-msg)
// with add_end
ctx->SaveTensorForBackward(inputs.at(2)); // moving_mean 3
ctx->SaveTensorForBackward(inputs.at(3)); // moving_variance 4
}
}
ctx->SaveTensorForBackward(outputs.at(0)); // y 5
ctx->SaveTensorForBackward(outputs.at(1)); // reserve space 6
return Maybe<void>::Ok();
}
Maybe<void> Apply(const NormalizationAddReluGradCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
const auto& x = ctx->SavedTensors().at(0); // x
const auto& gamma = ctx->SavedTensors().at(1); // gamma
const auto& beta = ctx->SavedTensors().at(2); // beta
const auto& y_grad = out_grads.at(0);
std::shared_ptr<Tensor> mean, inv_variance;
if (ctx->is_training || !ctx->track_running_stats) {
mean = ctx->SavedTensors().at(3); // mean
inv_variance = ctx->SavedTensors().at(4); // inv_variance
} else {
const auto& moving_mean = ctx->SavedTensors().at(3); // moving_mean
const auto& moving_variance = ctx->SavedTensors().at(4); // moving_variance
const auto& add_eps = JUST(
functional::ScalarAdd(moving_variance, ctx->epsilon, /*alpha=*/1, /*inplace=*/false));
mean = moving_mean;
inv_variance = JUST(functional::Rsqrt(add_eps));
}
const auto& y = ctx->SavedTensors().at(5);
const auto& reserve_space = ctx->SavedTensors().at(6);
const auto& results = JUST(functional::NormalizationAddReluGrad(
x, y_grad, mean, inv_variance, gamma, beta, reserve_space, y, ctx->axis, ctx->epsilon,
ctx->has_addend));
CHECK_EQ_OR_RETURN(results->size(), (ctx->has_addend ? 4 : 3))
<< Error::RuntimeError() << "The number of results is expected to be "
<< (ctx->has_addend ? 4 : 3) << ", but got "
<< results->size(); // here output includes "gamma_diff" "beta_diff" "dx" "addend_diff"
if (ctx->track_running_stats) {
// With track_running_stats, the op has 5 inputs (x, moving_mean, moving_variance, gamma, beta),
// or 6 inputs (x, add_end, moving_mean, moving_variance, gamma, beta) when add_end is present.
if (ctx->has_addend) {
in_grads->resize(6);
if (ctx->gamma_requires_grad) {
in_grads->at(4) = results->at(1); // gamma_diff;
}
if (ctx->beta_requires_grad) {
in_grads->at(5) = results->at(2); // beta_diff
}
if (ctx->addend_requires_grad) {
in_grads->at(1) = results->at(3); // add_end_diff
}
} else {
in_grads->resize(5);
if (ctx->gamma_requires_grad) {
in_grads->at(3) = results->at(1); // gamma_diff;
}
if (ctx->beta_requires_grad) {
in_grads->at(4) = results->at(2); // beta_diff
}
}
} else {
// Without track_running_stats, the op has 3 inputs (x, gamma, beta),
// or 4 inputs (x, addend, gamma, beta) when add_end is present.
if (ctx->has_addend) {
in_grads->resize(4);
if (ctx->addend_requires_grad) {
in_grads->at(1) = results->at(3); // addend_diff
}
if (ctx->gamma_requires_grad) {
in_grads->at(2) = results->at(1); // gamma_diff;
}
if (ctx->beta_requires_grad) {
in_grads->at(3) = results->at(2); // beta_diff
}
} else {
in_grads->resize(3);
if (ctx->gamma_requires_grad) {
in_grads->at(1) = results->at(1); // gamma_diff;
}
if (ctx->beta_requires_grad) {
in_grads->at(2) = results->at(2); // beta_diff
}
}
}
if (!ctx->x_requires_grad) { return Maybe<void>::Ok(); }
if (ctx->is_training) {
in_grads->at(0) = results->at(0);
return Maybe<void>::Ok();
}
// todo(zzk): add eval mode.
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("normalization_add_relu", NormalizationAddReluGrad);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/common/container_util.h"
namespace oneflow {
namespace one {
struct Pad2dCaptureState : public AutoGradCaptureState {
bool requires_grad;
std::vector<int64_t> paddings;
};
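// Backward sketch: reflection/replication padding copy existing input elements into the
// border, so the backward accumulates dy from the padded positions back onto the source
// positions; functional::PadGrad with mode "reflect"/"replicate" is expected to do this
// folding in the subclasses below.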
class Pad2d : public OpExprGradFunction<Pad2dCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const UserOpExpr* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> Capture(Pad2dCaptureState* ctx, const TensorTuple& inputs, const TensorTuple& outputs,
const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1); // NOLINT(maybe-need-error-msg)
CHECK_EQ_OR_RETURN(outputs.size(), 1); // NOLINT(maybe-need-error-msg)
ctx->requires_grad = JUST(VectorAt(inputs, 0))->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->paddings = JUST(composed_attrs.GetAttr<std::vector<int64_t>>("padding"));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
class ReflectionPad2d : public Pad2d {
public:
Maybe<void> Apply(const Pad2dCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
in_grads->resize(1);
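    // PadGrad folds the gradient contributions from the reflected border positions back onto the
    // corresponding interior elements of the unpadded input.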
if (ctx->requires_grad) {
(*in_grads)[0] =
JUST(functional::PadGrad(JUST(VectorAt(out_grads, 0)), ctx->paddings, "reflect", 0));
}
return Maybe<void>::Ok();
}
};
class ReplicationPad2d : public Pad2d {
public:
Maybe<void> Apply(const Pad2dCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
in_grads->resize(1);
if (ctx->requires_grad) {
(*in_grads)[0] =
JUST(functional::PadGrad(JUST(VectorAt(out_grads, 0)), ctx->paddings, "replicate", 0));
}
return Maybe<void>::Ok();
}
};
struct ConstantPadNdCaptureState : public AutoGradCaptureState {
bool requires_grad;
std::vector<int64_t> paddings;
};
class ConstantPadNd : public OpExprGradFunction<ConstantPadNdCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const UserOpExpr* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> Capture(ConstantPadNdCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1); // NOLINT(maybe-need-error-msg)
CHECK_EQ_OR_RETURN(outputs.size(), 1); // NOLINT(maybe-need-error-msg)
const std::shared_ptr<Tensor>& input_0 = JUST(VectorAt(inputs, 0));
ctx->requires_grad = input_0->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->paddings = JUST(composed_attrs.GetAttr<std::vector<int64_t>>("padding"));
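    // Negate the paddings so that the backward constant pad crops the padded borders from dy
    // instead of adding them.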
for (int i = 0; i < ctx->paddings.size(); i++) { ctx->paddings[i] = -ctx->paddings[i]; }
return Maybe<void>::Ok();
}
Maybe<void> Apply(const ConstantPadNdCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
CHECK_EQ_OR_RETURN(out_grads.size(), 1); // NOLINT(maybe-need-error-msg)
in_grads->resize(1);
if (ctx->requires_grad) {
(*in_grads)[0] =
JUST(functional::Pad(JUST(VectorAt(out_grads, 0)), ctx->paddings, "constant", Scalar(0)));
}
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("pad", ConstantPadNd);
REGISTER_OP_EXPR_GRAD_FUNCTION("reflection_pad2d", ReflectionPad2d);
REGISTER_OP_EXPR_GRAD_FUNCTION("replication_pad2d", ReplicationPad2d);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct PartialFCSampleState : public AutoGradCaptureState {
bool requires_grad = false;
int32_t index_sampled_label = -1;
int32_t index_weight = -1;
};
class PartialFCSample : public OpExprGradFunction<PartialFCSampleState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(PartialFCSampleState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const PartialFCSampleState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> PartialFCSample::Init(const OpExpr& op) {
const UserOpExpr* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> PartialFCSample::Capture(PartialFCSampleState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ctx->index_sampled_label = ctx->SaveTensorForBackward(outputs.at(1)); // sampled_label
ctx->index_weight = ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> PartialFCSample::Apply(const PartialFCSampleState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
CHECK_EQ_OR_RETURN(out_grads.size(), 3); // NOLINT(maybe-need-error-msg)
in_grads->resize(2);
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
const auto& diff_sampled_weight = out_grads.at(2); // diff of sampled_weight
const auto& sampled_tensor = ctx->SavedTensors().at(ctx->index_sampled_label);
const auto& weight = ctx->SavedTensors().at(ctx->index_weight);
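  // Scatter the sampled-weight gradient rows back into a full-size weight gradient, using the
  // sampled labels as segment ids.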
const auto& out_tensors_of_op0 = JUST(
functional::DistributedPariticalFCSampleDisableBoxing(diff_sampled_weight, sampled_tensor));
const auto& out_tensors_of_op1 = JUST(functional::UnsortedSegmentSumLike(
out_tensors_of_op0->at(0), out_tensors_of_op0->at(1), weight, 0));
in_grads->at(0) = out_tensors_of_op1;
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("distributed_partial_fc_sample", PartialFCSample);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/attr_map.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/functional/sequence_function.h"
namespace oneflow {
namespace one {
struct ReduceSumCaptureState : public AutoGradCaptureState {
std::vector<int32_t> axis;
};
class ReduceSum : public OpExprGradFunction<ReduceSumCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(ReduceSumCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const ReduceSumCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> ReduceSum::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> ReduceSum::Capture(ReduceSumCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->axis = JUST(composed_attrs.GetAttr<std::vector<int32_t>>("axis"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> ReduceSum::Apply(const ReduceSumCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
const auto& input = ctx->SavedTensors().at(0);
const auto& dy = out_grads.at(0);
in_grads->resize(1);
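  // Summation passes dy through unchanged to every reduced element, so the gradient is dy
  // broadcast back to the input shape along the reduced axes.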
in_grads->at(0) = JUST(functional::BroadcastLike(dy, input, ctx->axis));
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("reduce_sum", ReduceSum);
struct ReduceProdOpInterpState : public AutoGradCaptureState {
std::vector<int32_t> axis;
bool requires_grad;
};
class ReduceProdOp : public OpExprGradFunction<ReduceProdOpInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(ReduceProdOpInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const ReduceProdOpInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> ReduceProdOp::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> ReduceProdOp::Capture(ReduceProdOpInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->axis = JUST(composed_attrs.GetAttr<std::vector<int32_t>>("axis"));
ctx->requires_grad = inputs.at(0)->requires_grad();
ctx->SaveTensorForBackward(inputs.at(0));
ctx->SaveTensorForBackward(outputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> ReduceProdOp::Apply(const ReduceProdOpInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
const auto& input = ctx->SavedTensors().at(0);
const auto& output = ctx->SavedTensors().at(1);
const auto& dy = out_grads.at(0);
in_grads->resize(1);
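  // Since y = prod(x) over the reduced axes, d y / d x_i = y / x_i,
  // so dx = broadcast_like(dy * y, x) / x.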
in_grads->at(0) = JUST(
functional::SequenceFunction<Maybe<Tensor>()>([&]() { return functional::Mul(dy, output); })
.then(std::bind(functional::BroadcastLike, std::placeholders::_1, input, ctx->axis))
.then(std::bind(functional::Div, std::placeholders::_1, input))
.call());
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("reduce_prod", ReduceProdOp);
struct ReduceMaxOrMinCaptureState : public AutoGradCaptureState {
std::vector<int32_t> axis;
bool keepdims;
};
class ReduceMaxOrMin : public OpExprGradFunction<ReduceMaxOrMinCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(ReduceMaxOrMinCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const ReduceMaxOrMinCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override;
private:
AttrMap base_attrs_;
};
Maybe<void> ReduceMaxOrMin::Init(const OpExpr& op) {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> ReduceMaxOrMin::Capture(ReduceMaxOrMinCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const {
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->axis = JUST(composed_attrs.GetAttr<std::vector<int32_t>>("axis"));
ctx->keepdims = JUST(composed_attrs.GetAttr<bool>("keepdims"));
ctx->SaveTensorForBackward(inputs.at(0));
ctx->SaveTensorForBackward(outputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> ReduceMaxOrMin::Apply(const ReduceMaxOrMinCaptureState* ctx,
const TensorTuple& out_grads, TensorTuple* in_grads) const {
const auto& input = ctx->SavedTensors().at(0);
const auto& output = ctx->SavedTensors().at(1);
const auto& dy = out_grads.at(0);
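  // Build a 0/1 mask of the positions that attain the reduced max/min: broadcast the reduced
  // value back to the input shape, compare for equality, and cast to the input dtype.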
const auto cast_like =
JUST(functional::SequenceFunction<Maybe<Tensor>()>(
[&]() { return functional::BroadcastLike(output, input, ctx->axis); })
.then(std::bind(functional::BroadcastEqual, input, std::placeholders::_1))
.then(std::bind(functional::CastLike, std::placeholders::_1, input))
.call());
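  // Split dy evenly among tied positions: divide dy by the per-reduction count of matches,
  // then broadcast back to the input shape.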
const auto& bcast_like_div =
JUST(functional::SequenceFunction<Maybe<Tensor>()>(
[&]() { return functional::ReduceSum(cast_like, ctx->axis, ctx->keepdims); })
.then(std::bind(functional::Div, dy, std::placeholders::_1))
.then(std::bind(functional::BroadcastLike, std::placeholders::_1, input, ctx->axis))
.call());
in_grads->resize(1);
in_grads->at(0) = JUST(functional::Mul(bcast_like_div, cast_like));
return Maybe<void>::Ok();
}
REGISTER_OP_EXPR_GRAD_FUNCTION("reduce_min", ReduceMaxOrMin);
REGISTER_OP_EXPR_GRAD_FUNCTION("reduce_max", ReduceMaxOrMin);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct ReshapeCaptureState : public AutoGradCaptureState {
DimVector input_shape_vec;
};
class ReshapeOpExprGrad : public OpExprGradFunction<ReshapeCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
return Maybe<void>::Ok();
}
Maybe<void> Capture(ReshapeCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->input_shape_vec = inputs.at(0)->shape()->dim_vec();
return Maybe<void>::Ok();
}
Maybe<void> Apply(const ReshapeCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
in_grads->resize(1);
Shape shape(ctx->input_shape_vec);
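    // The gradient of reshape is just dy reshaped back to the original input shape.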
in_grads->at(0) = JUST(functional::Reshape(out_grads.at(0), shape));
return Maybe<void>::Ok();
}
};
REGISTER_OP_EXPR_GRAD_FUNCTION("reshape", ReshapeOpExprGrad);
} // namespace one
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/functional/functional.h"
namespace oneflow {
namespace one {
struct RoiAlignCaptureState : public AutoGradCaptureState {
float spatial_scale = 1.0;
int32_t pooled_h = 0;
int32_t pooled_w = 0;
int32_t sampling_ratio = -1;
bool aligned = false;
bool requires_grad = false;
};
class RoiAlign : public OpExprGradFunction<RoiAlignCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override {
const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr); // NOLINT(maybe-need-error-msg)
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}
Maybe<void> Capture(RoiAlignCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ctx->SaveTensorForBackward(inputs.at(0));
ctx->SaveTensorForBackward(inputs.at(1));
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->spatial_scale = JUST(composed_attrs.GetAttr<float>("spatial_scale"));
ctx->pooled_h = JUST(composed_attrs.GetAttr<int32_t>("pooled_h"));
ctx->pooled_w = JUST(composed_attrs.GetAttr<int32_t>("pooled_w"));
ctx->sampling_ratio = JUST(composed_attrs.GetAttr<int32_t>("sampling_ratio"));
ctx->aligned = JUST(composed_attrs.GetAttr<bool>("aligned"));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const RoiAlignCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
const auto& x_like = ctx->SavedTensors().at(0);
const auto& rois = ctx->SavedTensors().at(1);
    in_grads->resize(2);  // inputs are x and rois; only x receives a gradient
    in_grads->at(0) = JUST(
        functional::RoiAlignGrad(out_grads.at(0), x_like, rois, ctx->spatial_scale, ctx->pooled_h,
                                 ctx->pooled_w, ctx->sampling_ratio, ctx->aligned));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("roi_align", RoiAlign);
} // namespace one
} // namespace oneflow