OneFlow API Reference
===================================
Distributed performance (high efficiency) is a core technical challenge for deep learning frameworks.
Built around performance improvement and heterogeneous distributed scaling, OneFlow upholds the core concepts and architecture of static compilation and streaming parallelism, solving the cluster-level memory-wall challenge with world-leading technology.
.. toctree::
:maxdepth: 1
:caption: OneFlow Python API
oneflow
nn
nn.functional
tensor
tensor_attributes
functional
type_info
autograd
cuda
distributed
distributions
hub
linalg
nn.init
optim
module
graph
auto_parallel
image
utils
env
comm
utils.data
utils.global_view
utils.tensor
one_embedding
environment_variables
oneflow.linalg
===================================
OneFlow linear algebra operations.
----------------------------------
.. The documentation is referenced from:
   https://pytorch.org/docs/1.10/linalg.html
Common linear algebra operations.
Matrix Properties
-----------------
.. currentmodule:: oneflow.linalg
.. autofunction:: oneflow.linalg.matrix_norm
.. autofunction:: oneflow.linalg.norm
.. autofunction:: oneflow.linalg.vector_norm
.. autosummary::
:toctree: generated
:nosignatures:
norm
vector_norm
matrix_norm
diagonal
inv
cross
oneflow.nn.Module
================================================
Module class for building neural networks
---------------------------------------------------
.. currentmodule:: oneflow.nn
.. autoclass:: oneflow.nn.Module
:members:
oneflow.nn.functional
===========================================
.. The documentation is referenced from: https://pytorch.org/docs/1.10/nn.functional.html.
.. contents:: oneflow.nn.functional
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. currentmodule:: oneflow.nn.functional
Convolution functions
-------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
conv1d
conv2d
conv3d
conv_transpose1d
conv_transpose2d
conv_transpose3d
fold
unfold
BatchNorm functions
--------------------
.. autosummary::
:toctree: generated
:nosignatures:
batch_norm
Pooling functions
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
avg_pool1d
avg_pool2d
avg_pool3d
max_pool1d
max_pool2d
max_pool3d
max_unpool1d
max_unpool2d
max_unpool3d
adaptive_avg_pool1d
adaptive_avg_pool2d
adaptive_avg_pool3d
adaptive_max_pool1d
adaptive_max_pool2d
adaptive_max_pool3d
Non-linear activation functions
-------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
threshold
relu
hardtanh
hardswish
relu6
elu
selu
celu
leaky_relu
prelu
glu
gelu
quick_gelu
logsigmoid
hardshrink
softsign
softplus
softmax
softshrink
log_softmax
gumbel_softmax
tanh
sigmoid
hardsigmoid
silu
mish
layer_norm
normalize
Linear functions
----------------
.. autosummary::
:toctree: generated
:nosignatures:
linear
Dropout functions
-----------------
.. autosummary::
:toctree: generated
:nosignatures:
dropout
dropout1d
dropout2d
dropout3d
Sparse functions
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
embedding
one_hot
Distance functions
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
cosine_similarity
pairwise_distance
Loss functions
--------------
.. autosummary::
:toctree: generated
:nosignatures:
sparse_softmax_cross_entropy
cross_entropy
l1_loss
mse_loss
smooth_l1_loss
triplet_margin_loss
binary_cross_entropy
binary_cross_entropy_with_logits
Vision functions
----------------
.. autosummary::
:toctree: generated
:nosignatures:
deform_conv2d
pad
interpolate
upsample
grid_sample
affine_grid
Greedy decoder
----------------
.. autosummary::
:toctree: generated
:nosignatures:
ctc_greedy_decoder
oneflow.nn.init
===================================
Operators for initialization
----------------------------------
.. currentmodule:: oneflow.nn.init
.. The documentation is referenced from:
   https://pytorch.org/docs/1.10/nn.init.html
.. currentmodule:: oneflow.nn.init
.. autofunction:: calculate_gain
.. autofunction:: uniform_
.. autofunction:: normal_
.. autofunction:: constant_
.. autofunction:: ones_
.. autofunction:: zeros_
.. autofunction:: xavier_uniform_
.. autofunction:: xavier_normal_
.. autofunction:: kaiming_uniform_
.. autofunction:: kaiming_normal_
.. autofunction:: trunc_normal_
.. autofunction:: orthogonal_
oneflow.nn
===================================
Operators for neural networks
.. The documentation is referenced from:
   https://pytorch.org/docs/1.10/nn.html
These are the basic building blocks for graphs:
.. contents:: oneflow.nn
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. currentmodule:: oneflow.nn
.. autosummary::
:toctree: generated
:nosignatures:
:template:
Parameter
Containers
----------------------------------
.. currentmodule:: oneflow.nn
.. automodule:: oneflow.nn
:members: AdaptiveAvgPool1d,
AdaptiveAvgPool2d,
AdaptiveAvgPool3d,
AvgPool1d,
AvgPool2d,
AvgPool3d,
BCELoss,
BCEWithLogitsLoss,
BatchNorm1d,
BatchNorm2d,
BatchNorm3d,
COCOReader,
CTCLoss,
CoinFlip,
ConstantPad1d,
ConstantPad2d,
ConstantPad3d,
Conv1d,
Conv2d,
Conv3d,
ConvTranspose1d,
ConvTranspose2d,
ConvTranspose3d,
CosineSimilarity,
CombinedMarginLoss,
CropMirrorNormalize,
CrossEntropyLoss,
Dropout,
ELU,
CELU,
Embedding,
Flatten,
GELU,
GLU,
GroupNorm,
Hardsigmoid,
Hardshrink,
Hardswish,
Hardtanh,
Identity,
InstanceNorm1d,
InstanceNorm2d,
InstanceNorm3d,
KLDivLoss,
L1Loss,
LayerNorm,
LeakyReLU,
Linear,
LogSigmoid,
LogSoftmax,
MSELoss,
MarginRankingLoss,
TripletMarginLoss,
MaxPool1d,
MaxPool2d,
MaxPool3d,
ModuleDict,
ModuleList,
Mish,
NLLLoss,
OFRecordImageDecoder,
OFRecordImageDecoderRandomCrop,
OFRecordRawDecoder,
OFRecordReader,
OFRecordBytesDecoder,
PReLU,
Parameter,
ParameterDict,
ParameterList,
PixelShuffle,
ReLU,
ReLU6,
ReflectionPad2d,
ReplicationPad2d,
Sequential,
SELU,
SiLU,
Sigmoid,
SmoothL1Loss,
Softmax,
Softplus,
Softshrink,
Softsign,
Tanh,
Threshold,
Upsample,
UpsamplingBilinear2d,
UpsamplingNearest2d,
ZeroPad2d,
MinMaxObserver,
MovingAverageMinMaxObserver,
FakeQuantization,
Quantization,
FusedBatchNorm1d,
FusedBatchNorm2d,
FusedBatchNorm3d,
FusedMLP,
.. autofunction:: oneflow.nn.modules.pixelshuffle.PixelShufflev2
.. autofunction:: oneflow.nn.parallel.DistributedDataParallel
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
Module
Sequential
ModuleList
ModuleDict
ParameterList
ParameterDict
nn.Module
----------------------------------
.. currentmodule:: oneflow.nn.Module
.. autosummary::
:toctree: generated
:nosignatures:
add_module
apply
buffers
children
cpu
cuda
double
train
eval
extra_repr
float
forward
load_state_dict
modules
named_buffers
named_children
named_modules
named_parameters
parameters
register_buffer
register_forward_hook
register_forward_pre_hook
register_parameter
requires_grad_
state_dict
to
zero_grad
Convolution Layers
----------------------------------
.. currentmodule:: oneflow
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.Conv1d
nn.Conv2d
nn.Conv3d
nn.ConvTranspose1d
nn.ConvTranspose2d
nn.ConvTranspose3d
nn.Unfold
nn.Fold
Pooling Layers
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.MaxPool1d
nn.MaxPool2d
nn.MaxPool3d
nn.MaxUnpool1d
nn.MaxUnpool2d
nn.MaxUnpool3d
nn.AdaptiveAvgPool1d
nn.AdaptiveAvgPool2d
nn.AdaptiveAvgPool3d
nn.AdaptiveMaxPool1d
nn.AdaptiveMaxPool2d
nn.AdaptiveMaxPool3d
nn.AvgPool1d
nn.AvgPool2d
nn.AvgPool3d
Padding Layers
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.ConstantPad1d
nn.ConstantPad2d
nn.ConstantPad3d
nn.ReflectionPad1d
nn.ReflectionPad2d
nn.ReplicationPad1d
nn.ReplicationPad2d
nn.ZeroPad2d
Non-linear Activations (weighted sum, nonlinearity)
----------------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.ELU
nn.Hardshrink
nn.Hardsigmoid
nn.Hardswish
nn.Hardtanh
nn.LeakyReLU
nn.LogSigmoid
nn.PReLU
nn.ReLU
nn.ReLU6
nn.SELU
nn.CELU
nn.GELU
nn.QuickGELU
nn.SiLU
nn.Sigmoid
nn.Mish
nn.Softplus
nn.Softshrink
nn.Softsign
nn.Tanh
nn.Threshold
nn.GLU
Non-linear Activations (other)
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.Softmax
nn.LogSoftmax
Normalization Layers
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.BatchNorm1d
nn.BatchNorm2d
nn.BatchNorm3d
nn.SyncBatchNorm
nn.FusedBatchNorm1d
nn.FusedBatchNorm2d
nn.FusedBatchNorm3d
nn.GroupNorm
nn.InstanceNorm1d
nn.InstanceNorm2d
nn.InstanceNorm3d
nn.LayerNorm
nn.RMSLayerNorm
nn.RMSNorm
Recurrent Layers
----------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.RNN
nn.LSTM
nn.GRU
nn.RNNCell
nn.LSTMCell
nn.GRUCell
Linear Layers
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.Identity
nn.Linear
Dropout Layers
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.Dropout
nn.Dropout1d
nn.Dropout2d
nn.Dropout3d
Sparse Layers
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.Embedding
Distance Functions
------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.CosineSimilarity
nn.PairwiseDistance
Loss Functions
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.BCELoss
nn.BCEWithLogitsLoss
nn.CTCLoss
nn.CombinedMarginLoss
nn.CrossEntropyLoss
nn.KLDivLoss
nn.L1Loss
nn.MSELoss
nn.MarginRankingLoss
nn.NLLLoss
nn.SmoothL1Loss
nn.TripletMarginLoss
Vision Layers
----------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.PixelShuffle
nn.Upsample
nn.UpsamplingBilinear2d
nn.UpsamplingNearest2d
DataParallel Layers (multi-GPU, distributed)
--------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.parallel.DistributedDataParallel
Data loading and preprocessing Layers
----------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
nn.COCOReader
nn.CoinFlip
nn.CropMirrorNormalize
nn.OFRecordBytesDecoder
nn.OFRecordImageDecoder
nn.OFRecordImageDecoderRandomCrop
nn.OFRecordRawDecoder
nn.OFRecordReader
Quantization Aware Training
--------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
nn.MinMaxObserver
nn.MovingAverageMinMaxObserver
nn.FakeQuantization
nn.QatConv1d
nn.QatConv2d
nn.QatConv3d
Utilities
---------
From the ``oneflow.nn.utils`` module:
.. currentmodule:: oneflow.nn.utils
.. autofunction:: oneflow.nn.utils.clip_grad_norm_
.. autofunction:: oneflow.nn.utils.weight_norm
.. autofunction:: oneflow.nn.utils.remove_weight_norm
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
clip_grad_norm_
clip_grad_value_
weight_norm
remove_weight_norm
Utility functions in other modules:
.. currentmodule:: oneflow
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.utils.rnn.PackedSequence
nn.utils.rnn.pack_padded_sequence
nn.utils.rnn.pad_packed_sequence
nn.utils.rnn.pad_sequence
nn.utils.rnn.pack_sequence
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
nn.Flatten
Quantized Functions
--------------------
Quantization refers to techniques for performing computations and
storing tensors at lower bitwidths than floating point precision.
.. autosummary::
:toctree: generated
:nosignatures:
:template:
nn.FakeQuantization
nn.MinMaxObserver
nn.MovingAverageMinMaxObserver
nn.Quantization
oneflow.one_embedding
===================================
OneFlow one_embedding operations.
Embedding is an important component of recommender systems, and it has also spread to many fields beyond recommender systems. Each framework provides basic operators for Embedding; for example, ``flow.nn.Embedding`` in OneFlow:
::
import numpy as np
import oneflow as flow
indices = flow.tensor([[1, 2, 4, 5], [4, 3, 2, 9]], dtype=flow.int)
embedding = flow.nn.Embedding(10, 3)
y = embedding(indices)
OneEmbedding is the large-scale Embedding solution that OneFlow provides for large-scale deep recommender systems. Compared to ordinary operators, OneEmbedding has the following advantages:
- With flexible hierarchical storage, OneEmbedding can place the Embedding table in GPU memory, CPU memory, or on SSD, and allows high-speed devices to serve as caches for low-speed devices, achieving both speed and capacity.
- OneEmbedding supports dynamic expansion.
.. note ::
Please refer to `Large-Scale Embedding Solution: OneEmbedding <https://docs.oneflow.org/en/master/cookies/one_embedding.html>`__
for a brief introduction to all features related to OneEmbedding.
Configure Embedding Table
----------------------------------
.. currentmodule:: oneflow.one_embedding
.. autoclass:: MultiTableEmbedding
:members: forward,
save_snapshot,
load_snapshot,
.. autofunction:: oneflow.one_embedding.MultiTableEmbedding.forward
.. autoclass:: MultiTableMultiColumnEmbedding
:members: forward,
save_snapshot,
load_snapshot,
.. autofunction:: oneflow.one_embedding.MultiTableMultiColumnEmbedding.forward
.. autofunction:: oneflow.one_embedding.make_device_mem_store_options
.. autofunction:: oneflow.one_embedding.make_cached_ssd_store_options
.. autofunction:: oneflow.one_embedding.make_cached_host_mem_store_options
.. autofunction:: oneflow.one_embedding.make_uniform_initializer
.. autofunction:: oneflow.one_embedding.make_normal_initializer
OneEmbedding supports creating multiple Embedding tables at once. The following code configures three Embedding tables.
.. code-block::
import oneflow as flow
import oneflow.nn as nn
import numpy as np
tables = [
flow.one_embedding.make_table_options(
flow.one_embedding.make_uniform_initializer(low=-0.1, high=0.1)
),
flow.one_embedding.make_table_options(
flow.one_embedding.make_uniform_initializer(low=-0.05, high=0.05)
),
flow.one_embedding.make_table_options(
flow.one_embedding.make_uniform_initializer(low=-0.15, high=0.15)
),
]
When configuring an Embedding table, you need to specify its initialization method. The Embedding tables above are initialized with the ``uniform`` method. The resulting configuration is stored in the ``tables`` variable.
.. autofunction:: oneflow.one_embedding.make_table_options
.. autofunction:: oneflow.one_embedding.make_table
Initialization Method
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.one_embedding
.. autosummary::
:toctree: generated
:nosignatures:
make_uniform_initializer
make_normal_initializer
Configure the Storage Attribute of the Embedding Table
--------------------------------------------------------------------
Then run the following code to configure the storage attributes of the Embedding table:
.. code-block::
store_options = flow.one_embedding.make_cached_ssd_store_options(
cache_budget_mb=8142,
persistent_path="/your_path_to_ssd",
capacity=40000000,
size_factor=1,
physical_block_size=4096
)
Storage Method
^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.one_embedding
.. autosummary::
:toctree: generated
:nosignatures:
make_device_mem_store_options
make_cached_ssd_store_options
make_cached_host_mem_store_options
.. note ::
Please refer to `Large-Scale Embedding Solution: OneEmbedding <https://docs.oneflow.org/en/master/cookies/one_embedding.html#feature-id-and-dynamic-insertion>`__
to learn how to choose the proper storage configuration.
Instantiate Embedding
--------------------------------------------------------------------
After the configuration above is complete, you can use ``MultiTableEmbedding`` to instantiate the Embedding layer.
.. code-block::
embedding_size = 128
embedding = flow.one_embedding.MultiTableEmbedding(
name="my_embedding",
embedding_dim=embedding_size,
dtype=flow.float,
key_type=flow.int64,
tables=tables,
store_options=store_options,
)
embedding.to("cuda")
.. note ::
Please refer to `Large-Scale Embedding Solution: OneEmbedding <https://docs.oneflow.org/en/master/cookies/one_embedding.html#feature-id-and-multi-table-query>`__
to learn about Feature ID and Multi-Table Query.
MultiTableEmbedding
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: oneflow.one_embedding.MultiTableEmbedding
.. currentmodule:: oneflow.one_embedding.MultiTableEmbedding
.. autosummary::
:toctree: generated
:nosignatures:
forward
save_snapshot
load_snapshot
MultiTableMultiColumnEmbedding
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: oneflow.one_embedding.MultiTableMultiColumnEmbedding
.. currentmodule:: oneflow.one_embedding.MultiTableMultiColumnEmbedding
.. autosummary::
:toctree: generated
:nosignatures:
forward
save_snapshot
load_snapshot
Construct Graph for Training
--------------------------------------------------------------------
OneEmbedding is only supported in Graph mode.
.. code-block::
num_tables = 3
mlp = flow.nn.FusedMLP(
in_features=embedding_size * num_tables,
hidden_features=[512, 256, 128],
out_features=1,
skip_final_activation=True,
)
mlp.to("cuda")
class TrainGraph(flow.nn.Graph):
def __init__(self,):
super().__init__()
self.embedding_lookup = embedding
self.mlp = mlp
self.add_optimizer(
flow.optim.SGD(self.embedding_lookup.parameters(), lr=0.1, momentum=0.0)
)
self.add_optimizer(
flow.optim.SGD(self.mlp.parameters(), lr=0.1, momentum=0.0)
)
def build(self, ids):
embedding = self.embedding_lookup(ids)
loss = self.mlp(flow.reshape(embedding, (-1, num_tables * embedding_size)))
loss = loss.sum()
loss.backward()
return loss
.. note ::
Please refer to `Distributed Training: OneEmbedding <https://docs.oneflow.org/en/master/parallelism/01_introduction.html>`__
to learn about building a Graph for training.
Persistent Read & Write
-----------------------------------------------
.. currentmodule:: oneflow.one_embedding
.. autosummary::
:toctree: generated
:nosignatures:
make_persistent_table_reader
make_persistent_table_writer
.. automodule:: oneflow.one_embedding
:members: Ftrl
.. autofunction:: oneflow.one_embedding.make_persistent_table_reader
.. autofunction:: oneflow.one_embedding.make_persistent_table_writer
oneflow
===================================
oneflow
----------------------------------
.. The documentation is referenced from:
   https://pytorch.org/docs/1.10/torch.html
The oneflow package contains data structures for multi-dimensional tensors and defines mathematical operations over these tensors. Additionally, it provides many utilities for efficiently serializing tensors and arbitrary types, and other useful utilities.
It has a CUDA counterpart that enables you to run your tensor computations on an NVIDIA GPU with compute capability >= 3.0.
.. currentmodule:: oneflow
.. automodule:: oneflow
:members: adaptive_avg_pool1d,
adaptive_avg_pool2d,
adaptive_avg_pool3d,
abs,
acos,
acosh,
add,
addcmul,
addmm,
all,
amin,
amax,
any,
arccos,
arcsin,
arcsinh,
arccosh,
arctan,
arctanh,
argmax,
argmin,
arange,
argsort,
argwhere,
asin,
asinh,
atan,
atan2,
atanh,
bernoulli,
broadcast_like,
batch_gather,
bmm,
cat,
concat,
cast,
ceil,
chunk,
clamp,
clip,
cos,
cosh,
diag,
select,
diagonal,
movedim,
tensor_split,
hsplit,
vsplit,
as_strided,
div,
dot,
eq,
einsum,
equal,
expand,
eye,
exp,
expm1,
erf,
erfc,
erfinv,
flatten,
flip,
floor,
floor_,
fmod,
full,
gather,
gather_nd,
gelu,
gt,
in_top_k,
index_select,
linspace,
logical_and,
logical_or,
logical_not,
logical_xor,
load,
log,
log2,
log1p,
lt,
le,
masked_fill,
masked_select,
matmul,
mm,
mv,
narrow,
max,
mean,
median,
mish,
min,
meshgrid,
mul,
neg,
negative,
new_ones,
nonzero,
normal,
numel,
ne,
empty,
ones,
ones_like,
pow,
prod,
rand,
randn,
repeat,
repeat_interleave,
reshape,
randint,
randperm,
reciprocal,
roc_auc_score,
roll,
round,
rsqrt,
save,
scatter,
scatter_add,
scatter_nd,
tensor_scatter_nd_update,
sin,
sin_,
sinh,
sign,
selu,
silu,
slice,
slice_update,
softsign,
sort,
softplus,
sigmoid,
softmax,
squeeze,
split,
stack,
std,
sub,
sum,
sqrt,
square,
swapaxes,
swapdims,
tan,
tanh,
tensor,
tensordot,
tile,
transpose,
t,
tril,
unsqueeze,
unbind,
permute,
var,
where,
zeros,
zeros_like,
is_nonzero,
is_tensor,
no_grad,
set_grad_enabled,
enable_grad,
inference_mode,
is_grad_enabled,
is_floating_point,
set_printoptions,
decode_onerec,
from_numpy,
as_tensor,
cumsum,
topk,
nms,
cumprod,
HalfTensor,
FloatTensor,
DoubleTensor,
BoolTensor,
ByteTensor,
CharTensor,
IntTensor,
LongTensor,
seed,
manual_seed,
initial_seed,
get_rng_state,
set_rng_state,
isnan,
isinf,
searchsorted
.. autofunction:: oneflow.relu
.. autofunction:: oneflow.set_num_threads
Tensor
-------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
BoolTensor
ByteTensor
CharTensor
DoubleTensor
FloatTensor
HalfTensor
IntTensor
LongTensor
.. autosummary::
:toctree: generated
:nosignatures:
is_tensor
is_floating_point
is_nonzero
numel
set_printoptions
get_default_dtype
set_default_dtype
set_default_tensor_type
.. _tensor-creation-ops:
Creation Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. note::
Random sampling creation ops are listed under :ref:`random-sampling` and
include:
:func:`oneflow.rand`
:func:`oneflow.randn`
:func:`oneflow.randint`
:func:`oneflow.randperm`
.. autosummary::
:toctree: generated
:nosignatures:
tensor
as_tensor
as_strided
from_numpy
zeros
zeros_like
ones
ones_like
randn_like
randint_like
masked_fill
new_ones
arange
linspace
eye
empty
empty_like
full
full_like
tensor_scatter_nd_update
logspace
.. _indexing-slicing-joining:
Indexing, Slicing, Joining, Mutating Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
argwhere
atleast_1d
atleast_2d
atleast_3d
cat
column_stack
concat
chunk
dstack
expand
gather
gather_nd
batch_gather
hsplit
hstack
vsplit
vstack
index_select
index_add
masked_select
movedim
narrow
nonzero
permute
repeat
reshape
row_stack
select
scatter
scatter_add
scatter_nd
slice
slice_update
split
squeeze
stack
swapaxes
swapdims
t
tile
transpose
unbind
unsqueeze
where
tensor_split
.. _random-sampling:
Random sampling
-------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
seed
manual_seed
initial_seed
get_rng_state
set_rng_state
bernoulli
normal
rand
randint
randn
randperm
multinomial
In-place random sampling
~~~~~~~~~~~~~~~~~~~~~~~~
There are a few more in-place random sampling functions defined on Tensors as well. Click through to refer to their documentation:
- :func:`oneflow.Tensor.normal_` - in-place version of :func:`oneflow.normal`
- :func:`oneflow.Tensor.uniform_` - numbers sampled from the continuous uniform distribution
Serialization
-------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
save
load
Parallelism
-------------------------------------------
.. autosummary::
:toctree: generated
:nosignatures:
set_num_threads
Locally disabling gradient computation
-------------------------------------------
The context managers :func:`oneflow.no_grad`, :func:`oneflow.enable_grad`, and
:func:`oneflow.set_grad_enabled` are helpful for locally disabling and enabling
gradient computation. These context managers are thread local, so they won't
work if you send work to another thread using the ``threading`` module, etc.
Examples::
>>> import oneflow
>>> x = oneflow.zeros(1, requires_grad=True)
>>> with oneflow.no_grad():
... y = x * 2
>>> y.requires_grad
False
>>> with oneflow.set_grad_enabled(False):
... y = x * 2
>>> y.requires_grad
False
>>> with oneflow.set_grad_enabled(True):
... y = x * 2
>>> y.requires_grad
True
.. autosummary::
:toctree: generated
:nosignatures:
no_grad
set_grad_enabled
enable_grad
is_grad_enabled
inference_mode
Math operations
-------------------------------------------
Pointwise Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
abs
acos
acosh
arccos
arccosh
add
addcdiv
addcmul
asin
asinh
arcsin
arcsinh
atan
atanh
arctan
arctanh
atan2
ceil
clamp
clamp_min
clamp_max
clip
cos
cosh
div
erf
erfc
erfinv
exp
expm1
floor
floor_
fmod
gelu
quick_gelu
log
log1p
log2
log10
logical_and
logical_not
logical_or
logical_xor
mish
mul
neg
negative
pow
reciprocal
round
rsqrt
selu
softmax
softplus
softsign
silu
sigmoid
sign
sin
sinh
sin_
sqrt
square
sub
tan
tanh
trunc
floor_divide
Reduction Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
argmax
argmin
amax
amin
any
max
min
mean
median
prod
nansum
std
sum
logsumexp
var
norm
all
Comparison Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
argsort
eq
equal
gt
isinf
isnan
le
lt
ne
sort
topk
ge
greater
greater_equal
maximum
minimum
not_equal
isclose
allclose
Spectral Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
hann_window
Other Ops
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
adaptive_avg_pool1d
adaptive_avg_pool2d
adaptive_avg_pool3d
broadcast_like
cast
cumprod
cumsum
decode_onerec
diag
diagonal
einsum
flatten
flip
in_top_k
meshgrid
nms
roc_auc_score
roll
searchsorted
tensordot
tril
repeat_interleave
triu
cross
bincount
broadcast_shapes
broadcast_tensors
broadcast_to
unique
BLAS and LAPACK Operations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
addmm
bmm
baddbmm
dot
matmul
mm
mv
oneflow.optim
===================================
Optimizers
----------------------------------
.. The documentation is referenced from:
   https://pytorch.org/docs/1.10/optim.html
oneflow.optim is a package implementing various optimization algorithms. The most commonly used methods are already supported, and the interface is general enough that more sophisticated ones can also be easily integrated in the future.
How to use an optimizer
-----------------------
To use :mod:`oneflow.optim`, you have to construct an optimizer object that will hold
the current state and update the parameters based on the computed gradients.
Constructing it
^^^^^^^^^^^^^^^
To construct an :class:`Optimizer` you have to give it an iterable containing the
parameters (all should be :class:`~oneflow.autograd.Variable` s) to optimize. Then,
you can specify optimizer-specific options such as the learning rate, weight decay, etc.
.. note::
If you need to move a model to GPU via ``.cuda()``, please do so before
constructing optimizers for it. Parameters of a model after ``.cuda()``
will be different objects from those before the call.
In general, you should make sure that optimized parameters live in
consistent locations when optimizers are constructed and used.
Example::
import oneflow
import oneflow.nn as nn
import oneflow.optim as optim
model = nn.Linear(16, 3)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
Per-parameter options
^^^^^^^^^^^^^^^^^^^^^
:class:`Optimizer` also supports specifying per-parameter options. To do this, instead
of passing an iterable of :class:`~oneflow.autograd.Variable`, pass in an iterable of
:class:`dict`. Each of them will define a separate parameter group, and should contain
a ``params`` key, containing a list of parameters belonging to it. Other keys
should match the keyword arguments accepted by the optimizers, and will be used
as optimization options for this group.
.. note::
You can still pass options as keyword arguments. They will be used as
defaults, in the groups that didn't override them. This is useful when you
only want to vary a single option, while keeping all others consistent
between parameter groups.
For example, this is very useful when one wants to specify per-layer learning rates::
import oneflow.nn as nn
import oneflow.optim as optim
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
self.base = nn.Linear(64, 32)
self.classifier = nn.Linear(32, 10)
def forward(self, x):
out = self.base(x)
out = self.classifier(out)
return out
model = Model()
optim.SGD(
[
{"params": model.base.parameters()},
{"params": model.classifier.parameters(), "lr": 1e-3},
],
lr=1e-2,
momentum=0.9,
)
This means that ``model.base``'s parameters will use the default learning rate of ``1e-2``,
``model.classifier``'s parameters will use a learning rate of ``1e-3``, and a momentum of
``0.9`` will be used for all parameters.
Taking an optimization step
^^^^^^^^^^^^^^^^^^^^^^^^^^^
All optimizers implement a :func:`~Optimizer.step` method that updates the
parameters. It can be used in two ways:
``optimizer.step()``
~~~~~~~~~~~~~~~~~~~~
This is a simplified version supported by most optimizers. The function can be
called once the gradients are computed using e.g.
:func:`~oneflow.autograd.Variable.backward`.
Example::
import oneflow
import oneflow.nn as nn
import oneflow.nn.functional as F
import oneflow.optim as optim
from oneflow.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
def __init__(self, num):
self.inputs = oneflow.randn(num, 1)
self.targets = oneflow.sin(self.inputs)
def __len__(self):
return self.inputs.shape[0]
def __getitem__(self, index):
return self.inputs[index], self.targets[index]
class Model(nn.Module):
def __init__(self, input_size):
super(Model, self).__init__()
self.linear1 = nn.Linear(input_size, 64)
self.linear2 = nn.Linear(64, input_size)
def forward(self, x):
out = self.linear1(x)
return self.linear2(F.relu(out))
dataset = CustomDataset(10000)
dataloader = DataLoader(dataset, batch_size=10)
model = Model(1)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)
for epoch in range(100):
for input, target in dataloader:
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
.. _optimizer-algorithms:
.. currentmodule:: oneflow.optim
.. automodule:: oneflow.optim
:members: Adam,
Adagrad,
AdamW,
Optimizer,
RMSprop,
SGD,
LAMB,
lr_scheduler
.. automodule:: oneflow.optim.lr_scheduler
:members: CosineDecayLR,
CosineAnnealingLR,
LambdaLR,
StepLR,
MultiStepLR,
ExponentialLR,
ReduceLROnPlateau,
PolynomialLR
Base class
----------
.. autoclass:: Optimizer
.. autosummary::
:toctree: generated
:nosignatures:
Optimizer.add_param_group
Optimizer.load_state_dict
Optimizer.state_dict
Optimizer.step
Optimizer.zero_grad
Algorithms
----------
.. autosummary::
:toctree: generated
:nosignatures:
Adagrad
Adam
AdamW
LAMB
RMSprop
SGD
Adjust Learning Rate
--------------------
:mod:`oneflow.optim.lr_scheduler` provides several methods to adjust the learning
rate based on the number of epochs. :class:`oneflow.optim.lr_scheduler.ReduceLROnPlateau`
allows dynamic learning rate reducing based on some validation measurements.
Learning rate scheduling should be applied after the optimizer's update; e.g., you
should write your code this way:
Example::
import oneflow
import oneflow.nn as nn
import oneflow.nn.functional as F
import oneflow.optim as optim
from oneflow.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
def __init__(self, num):
self.inputs = oneflow.randn(num, 1)
self.targets = oneflow.sin(self.inputs)
def __len__(self):
return self.inputs.shape[0]
def __getitem__(self, index):
return self.inputs[index], self.targets[index]
class Model(nn.Module):
def __init__(self, input_size):
super(Model, self).__init__()
self.linear1 = nn.Linear(input_size, 64)
self.linear2 = nn.Linear(64, input_size)
def forward(self, x):
out = self.linear1(x)
return self.linear2(F.relu(out))
dataset = CustomDataset(10000)
dataloader = DataLoader(dataset, batch_size=10)
model = Model(1)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
for epoch in range(20):
for input, target in dataloader:
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
scheduler.step()
Most learning rate schedulers can be chained (also referred to as
chaining schedulers).
Example::
import oneflow
import oneflow.nn as nn
import oneflow.nn.functional as F
import oneflow.optim as optim
from oneflow.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
def __init__(self, num):
self.inputs = oneflow.randn(num, 1)
self.targets = oneflow.sin(self.inputs)
def __len__(self):
return self.inputs.shape[0]
def __getitem__(self, index):
return self.inputs[index], self.targets[index]
class Model(nn.Module):
def __init__(self, input_size):
super(Model, self).__init__()
self.linear1 = nn.Linear(input_size, 64)
self.linear2 = nn.Linear(64, input_size)
def forward(self, x):
out = self.linear1(x)
return self.linear2(F.relu(out))
dataset = CustomDataset(10000)
dataloader = DataLoader(dataset, batch_size=10)
model = Model(1)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)
scheduler1 = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5, 10], gamma=0.1)
for epoch in range(20):
for input, target in dataloader:
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
scheduler1.step()
scheduler2.step()
In many places in the documentation, we will use the following template to refer to scheduler
algorithms.
>>> scheduler = ...
>>> for epoch in range(100):
>>> train(...)
>>> validate(...)
>>> scheduler.step()
.. warning::
If you use the learning rate scheduler (calling ``scheduler.step()``) before the optimizer's update
(calling ``optimizer.step()``), this will skip the first value of the learning rate schedule. Please
check if you are calling ``scheduler.step()`` at the wrong time.
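The pitfall in the warning above can be seen with a minimal pure-Python mock (the class below is illustrative, not a OneFlow API): when the scheduler runs before the optimizer's update, the initial learning rate is never used for a training step.

```python
# Minimal mock (NOT a OneFlow class) of an exponential LR scheduler that
# decays a shared learning-rate cell, analogous to optimizer.param_groups.
class MockExponentialLR:
    def __init__(self, lrs, gamma):
        self.lrs = lrs
        self.gamma = gamma

    def step(self):
        self.lrs[0] *= self.gamma

# Correct order: record the LR the optimizer would train with, then step.
lrs = [1.0]
sched = MockExponentialLR(lrs, gamma=0.1)
seen = []
for _ in range(3):
    seen.append(lrs[0])   # LR actually used by optimizer.step()
    sched.step()
# seen starts with the scheduled initial value 1.0

# Wrong order: the scheduler fires first, so the initial LR 1.0 is skipped.
lrs_bad = [1.0]
sched_bad = MockExponentialLR(lrs_bad, gamma=0.1)
seen_bad = []
for _ in range(3):
    sched_bad.step()          # scheduler called before the optimizer...
    seen_bad.append(lrs_bad[0])  # ...so 1.0 never reaches a training step
```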
.. autosummary::
:toctree: generated
:nosignatures:
lr_scheduler.CosineAnnealingLR
lr_scheduler.CosineDecayLR
lr_scheduler.ExponentialLR
lr_scheduler.LambdaLR
lr_scheduler.MultiStepLR
lr_scheduler.PolynomialLR
lr_scheduler.ReduceLROnPlateau
lr_scheduler.StepLR
lr_scheduler.ConstantLR
lr_scheduler.LinearLR
lr_scheduler.ChainedScheduler
lr_scheduler.SequentialLR
lr_scheduler.CosineAnnealingWarmRestarts
oneflow.Tensor
===================================
OneFlow Tensor Class
----------------------------------
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/tensors.html
A :class:`oneflow.Tensor` is a multi-dimensional matrix containing elements of
a single data type.
.. currentmodule:: oneflow
.. autoclass:: oneflow.Tensor
:members: abs,
acos,
acosh,
add,
add_,
addcmul,
addcmul_,
addmm,
amin,
amax,
arccos,
arccosh,
arcsin,
arcsinh,
arctan,
arctanh,
argmax,
argmin,
argsort,
argwhere,
asin,
asinh,
atan,
atan2,
atanh,
backward,
bmm,
byte,
cast,
ceil,
chunk,
clamp,
clamp_,
clip,
clip_,
clone,
copy_,
cos,
cosh,
cpu,
cuda,
data,
dot,
detach,
device,
placement,
sbp,
diag,
diagonal,
dim,
div,
div_,
double,
dtype,
element_size,
eq,
erf,
erfc,
erfinv,
erfinv_,
exp,
expand,
expand_as,
expm1,
fill_,
flatten,
flip,
float,
floor,
floor_,
fmod,
gather,
ge,
gelu,
get_device,
grad,
grad_fn,
gt,
half,
in_top_k,
index_select,
int,
is_global,
is_contiguous,
is_cuda,
is_floating_point,
is_lazy,
is_leaf,
item,
le,
log,
log1p,
logical_and,
logical_or,
logical_not,
logical_xor,
long,
lt,
masked_fill,
masked_select,
matmul,
mm,
mv,
max,
mean,
min,
mish,
mul,
mul_,
narrow,
ndim,
ndimension,
ne,
negative,
nelement,
new_empty,
new_ones,
new_zeros,
nonzero,
norm,
normal_,
numel,
numpy,
permute,
pow,
prod,
reciprocal,
register_hook,
relu,
repeat,
repeat_interleave,
requires_grad,
requires_grad_,
reshape,
retain_grad,
roll,
round,
rsqrt,
selu,
shape,
sigmoid,
sign,
silu,
sin,
sin_,
sinh,
size,
softmax,
softplus,
softsign,
sort,
split,
sqrt,
square,
squeeze,
std,
storage_offset,
stride,
sum,
swapaxes,
swapdims,
sub,
sub_,
tan,
tanh,
tile,
to,
local_to_global,
global_to_global,
to_global,
to_local,
to_consistent,
tolist,
topk,
transpose,
tril,
triu,
type_as,
type,
t,
T,
unbind,
unfold,
uniform_,
unsqueeze,
var,
view,
view_as,
where,
zero_,
nms,
pin_memory,
is_pinned,
Data types
----------
OneFlow defines 8 Tensor types with CPU and GPU variants which are as follows:
======================================= =============================================== =============================== ==================================
Data type dtype CPU tensor GPU tensor
======================================= =============================================== =============================== ==================================
Boolean ``oneflow.bool`` :class:`oneflow.BoolTensor` :class:`oneflow.cuda.BoolTensor`
8-bit integer (unsigned) ``oneflow.uint8`` :class:`oneflow.ByteTensor` :class:`oneflow.cuda.ByteTensor`
8-bit integer (signed) ``oneflow.int8`` :class:`oneflow.CharTensor` :class:`oneflow.cuda.CharTensor`
64-bit floating point ``oneflow.float64`` or ``oneflow.double`` :class:`oneflow.DoubleTensor` :class:`oneflow.cuda.DoubleTensor`
32-bit floating point ``oneflow.float32`` or ``oneflow.float`` :class:`oneflow.FloatTensor` :class:`oneflow.cuda.FloatTensor`
16-bit floating point ``oneflow.float16`` or ``oneflow.half`` :class:`oneflow.HalfTensor` :class:`oneflow.cuda.HalfTensor`
32-bit integer (signed) ``oneflow.int32`` or ``oneflow.int`` :class:`oneflow.IntTensor` :class:`oneflow.cuda.IntTensor`
64-bit integer (signed) ``oneflow.int64`` or ``oneflow.long`` :class:`oneflow.LongTensor` :class:`oneflow.cuda.LongTensor`
======================================= =============================================== =============================== ==================================
Initializing and basic operations
---------------------------------
A tensor can be constructed from a Python :class:`list` or sequence using the
:func:`oneflow.tensor` constructor:
::
>>> import oneflow
>>> import numpy as np
>>> oneflow.tensor([[1., -1.], [1., -1.]])
tensor([[ 1., -1.],
[ 1., -1.]], dtype=oneflow.float32)
>>> oneflow.tensor(np.array([[1, 2, 3], [4, 5, 6]]))
tensor([[ 1, 2, 3],
[ 4, 5, 6]], dtype=oneflow.int64)
.. warning::
:func:`oneflow.tensor` always copies :attr:`data`. If you have a Tensor
:attr:`data` and just want to change its ``requires_grad`` flag, use
:meth:`~oneflow.Tensor.requires_grad_` or
:meth:`~oneflow.Tensor.detach` to avoid a copy.
If you have a numpy array and want to avoid a copy, use
:func:`oneflow.as_tensor`.
.. A tensor of specific data type can be constructed by passing a :class:`oneflow.dtype` and/or a :class:`oneflow.device` to a constructor or tensor creation op:
::
>>> import oneflow
>>> oneflow.zeros([2, 4], dtype=oneflow.int32)
tensor([[ 0, 0, 0, 0],
[ 0, 0, 0, 0]], dtype=oneflow.int32)
>>> cuda0 = oneflow.device('cuda:0')
>>> oneflow.ones([2, 4], dtype=oneflow.float64, device=cuda0)
tensor([[ 1., 1., 1., 1.],
[ 1., 1., 1., 1.]], device='cuda:0', dtype=oneflow.float64)
For more information about building tensors, see :ref:`tensor-creation-ops`
The contents of a tensor can be accessed and modified using Python's indexing
and slicing notation:
::
>>> import oneflow
>>> x = oneflow.tensor([[1, 2, 3], [4, 5, 6]])
>>> print(x[1][2])
tensor(6, dtype=oneflow.int64)
>>> x[0][1] = 8
>>> print(x)
tensor([[1, 8, 3],
[4, 5, 6]], dtype=oneflow.int64)
Use :meth:`oneflow.Tensor.item` to get a Python number from a tensor containing a
single value:
::
>>> import oneflow
>>> x = oneflow.tensor([[1]])
>>> x
tensor([[1]], dtype=oneflow.int64)
>>> x.item()
1
>>> x = oneflow.tensor(2.5)
>>> x
tensor(2.5000, dtype=oneflow.float32)
>>> x.item()
2.5
For more information about indexing, see :ref:`indexing-slicing-joining`
A tensor can be created with :attr:`requires_grad=True` so that
:mod:`oneflow.autograd` records operations on them for automatic differentiation.
::
>>> import oneflow
>>> x = oneflow.tensor([[1., -1.], [1., 1.]], requires_grad=True)
>>> out = x.pow(2).sum()
>>> out.backward()
>>> x.grad
tensor([[ 2., -2.],
[ 2., 2.]], dtype=oneflow.float32)
.. note::
For more information on the :class:`oneflow.dtype`, :class:`oneflow.device`, and
:class:`oneflow.layout` attributes of a :class:`oneflow.Tensor`, see
:ref:`tensor-attributes-doc`.
.. note::
Methods which mutate a tensor are marked with an underscore suffix.
For example, :func:`oneflow.FloatTensor.add_` performs the addition
in-place and returns the modified tensor, while :func:`oneflow.FloatTensor.add`
returns the result in a new tensor.
.. note::
To change an existing tensor's :class:`oneflow.device` and/or :class:`oneflow.dtype`, consider using
:meth:`~oneflow.Tensor.to` method of Tensor object.
.. warning::
Current implementation of :class:`oneflow.Tensor` introduces memory overhead,
thus it might lead to unexpectedly high memory usage in applications with many tiny tensors.
If this is your case, consider using one large structure.
Tensor class reference
----------------------
.. class:: Tensor()
There are a few main ways to create a tensor, depending on your use case.
- To create a tensor with pre-existing data, use :func:`oneflow.tensor`.
- To create a tensor with specific size, use ``oneflow.*`` tensor creation
ops (see :ref:`tensor-creation-ops`).
- To create a tensor with the same size (and similar types) as another tensor,
use ``oneflow.*_like`` tensor creation ops
(see :ref:`tensor-creation-ops`).
.. currentmodule:: oneflow
.. autosummary::
:toctree: generated
:nosignatures:
Tensor.new_empty
Tensor.new_ones
Tensor.new_zeros
Tensor.new_full
Tensor.new_tensor
Tensor.is_cuda
Tensor.is_global
Tensor.device
Tensor.grad
Tensor.ndim
Tensor.abs
Tensor.acos
Tensor.acosh
Tensor.add
Tensor.add_
Tensor.addcdiv
Tensor.addcdiv_
Tensor.addcmul
Tensor.addcmul_
Tensor.addmm
Tensor.all
Tensor.amin
Tensor.amax
Tensor.any
Tensor.arccos
Tensor.arccosh
Tensor.arcsin
Tensor.arcsinh
Tensor.arctan
Tensor.arctanh
Tensor.argmax
Tensor.argmin
Tensor.argsort
Tensor.argwhere
Tensor.asin
Tensor.asinh
Tensor.atan
Tensor.atan2
Tensor.atanh
Tensor.backward
Tensor.bmm
Tensor.bool
Tensor.byte
Tensor.cast
Tensor.ceil
Tensor.chunk
Tensor.clamp
Tensor.clamp_
Tensor.clip
Tensor.clip_
Tensor.clone
Tensor.contiguous
Tensor.copy_
Tensor.cos
Tensor.cosh
Tensor.cpu
Tensor.cuda
Tensor.cumprod
Tensor.cumsum
Tensor.data
Tensor.dot
Tensor.detach
Tensor.placement
Tensor.sbp
Tensor.diag
Tensor.diagonal
Tensor.dim
Tensor.div
Tensor.div_
Tensor.double
Tensor.dtype
Tensor.element_size
Tensor.eq
Tensor.equal
Tensor.erf
Tensor.erfc
Tensor.erfinv
Tensor.erfinv_
Tensor.exp
Tensor.expand
Tensor.expand_as
Tensor.expm1
Tensor.fill_
Tensor.flatten
Tensor.flip
Tensor.float
Tensor.floor
Tensor.floor_
Tensor.floor_divide
Tensor.fmod
Tensor.gather
Tensor.ge
Tensor.get_device
Tensor.grad_fn
Tensor.gt
Tensor.half
Tensor.in_top_k
Tensor.index_select
Tensor.index_add
Tensor.index_add_
Tensor.int
Tensor.is_contiguous
Tensor.is_floating_point
Tensor.is_lazy
Tensor.is_leaf
Tensor.isinf
Tensor.isnan
Tensor.item
Tensor.le
Tensor.log
Tensor.log1p
Tensor.log2
Tensor.log10
Tensor.logical_and
Tensor.logical_or
Tensor.logical_not
Tensor.logical_xor
Tensor.long
Tensor.lt
Tensor.masked_fill
Tensor.masked_fill_
Tensor.masked_select
Tensor.matmul
Tensor.mm
Tensor.mv
Tensor.max
Tensor.maximum
Tensor.median
Tensor.mean
Tensor.min
Tensor.minimum
Tensor.mish
Tensor.mul
Tensor.mul_
Tensor.nansum
Tensor.narrow
Tensor.ndimension
Tensor.ne
Tensor.neg
Tensor.negative
Tensor.nelement
Tensor.nonzero
Tensor.norm
Tensor.normal_
Tensor.numel
Tensor.numpy
Tensor.permute
Tensor.pow
Tensor.prod
Tensor.reciprocal
Tensor.register_hook
Tensor.relu
Tensor.repeat
Tensor.repeat_interleave
Tensor.requires_grad
Tensor.requires_grad_
Tensor.reshape
Tensor.reshape_as
Tensor.retain_grad
Tensor.roll
Tensor.round
Tensor.rsqrt
Tensor.selu
Tensor.shape
Tensor.sigmoid
Tensor.sign
Tensor.silu
Tensor.sin
Tensor.sin_
Tensor.sinh
Tensor.size
Tensor.softmax
Tensor.softplus
Tensor.softsign
Tensor.sort
Tensor.split
Tensor.sqrt
Tensor.square
Tensor.squeeze
Tensor.squeeze_
Tensor.std
Tensor.storage_offset
Tensor.stride
Tensor.logsumexp
Tensor.sum
Tensor.swapaxes
Tensor.swapdims
Tensor.sub
Tensor.sub_
Tensor.tan
Tensor.tanh
Tensor.tile
Tensor.to
Tensor.local_to_global
Tensor.global_to_global
Tensor.to_global
Tensor.to_local
Tensor.to_consistent
Tensor.tolist
Tensor.topk
Tensor.transpose
Tensor.tril
Tensor.triu
Tensor.trunc
Tensor.type_as
Tensor.type
Tensor.t
Tensor.T
Tensor.unbind
Tensor.unfold
Tensor.uniform_
Tensor.unsqueeze
Tensor.unsqueeze_
Tensor.as_strided
Tensor.as_strided_
Tensor.var
Tensor.view
Tensor.view_as
Tensor.where
Tensor.zero_
Tensor.nms
Tensor.pin_memory
Tensor.is_pinned
Tensor.cross
Tensor.scatter
Tensor.scatter_
Tensor.scatter_add
Tensor.scatter_add_
Tensor.bernoulli
Tensor.bernoulli_
Tensor.bincount
Tensor.isclose
Tensor.allclose
Tensor.broadcast_to
Tensor.unique
.. currentmodule:: oneflow
.. _tensor-attributes-doc:
Tensor Attributes
=============================================================
.. The documentation is referenced from: https://pytorch.org/docs/1.10/tensor_attributes.html.
Each local ``oneflow.Tensor`` has a :class:`oneflow.dtype` and a :class:`oneflow.device`; each global ``oneflow.Tensor`` has a :class:`oneflow.dtype`, a :class:`oneflow.placement`, and an :class:`oneflow.sbp`.
.. contents:: oneflow
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. _dtype-doc:
oneflow.dtype
-----------------------
.. class:: dtype
A :class:`oneflow.dtype` is an object that represents the data type of a
:class:`oneflow.Tensor`. Oneflow has eight different data types:
======================================= =============================================== =============================== ==================================
Data type dtype CPU tensor GPU tensor
======================================= =============================================== =============================== ==================================
Boolean ``oneflow.bool`` :class:`oneflow.BoolTensor` :class:`oneflow.cuda.BoolTensor`
8-bit integer (unsigned) ``oneflow.uint8`` :class:`oneflow.ByteTensor` :class:`oneflow.cuda.ByteTensor`
8-bit integer (signed) ``oneflow.int8`` :class:`oneflow.CharTensor` :class:`oneflow.cuda.CharTensor`
64-bit floating point ``oneflow.float64`` or ``oneflow.double`` :class:`oneflow.DoubleTensor` :class:`oneflow.cuda.DoubleTensor`
32-bit floating point ``oneflow.float32`` or ``oneflow.float`` :class:`oneflow.FloatTensor` :class:`oneflow.cuda.FloatTensor`
16-bit floating point ``oneflow.float16`` or ``oneflow.half`` :class:`oneflow.HalfTensor` :class:`oneflow.cuda.HalfTensor`
32-bit integer (signed) ``oneflow.int32`` or ``oneflow.int`` :class:`oneflow.IntTensor` :class:`oneflow.cuda.IntTensor`
64-bit integer (signed) ``oneflow.int64`` or ``oneflow.long`` :class:`oneflow.LongTensor` :class:`oneflow.cuda.LongTensor`
======================================= =============================================== =============================== ==================================
To find out if a :class:`oneflow.dtype` is a floating point data type, the property :attr:`is_floating_point`
can be used, which returns ``True`` if the data type is a floating point data type.
.. _type-promotion-doc:
When the dtypes of inputs to an arithmetic operation (`add`, `sub`, `div`, `mul`) differ, we promote
by finding the minimum dtype that satisfies the following rules:
* If the type of a scalar operand is of a higher category than tensor operands
(where complex > floating > integral > boolean), we promote to a type with sufficient size to hold
all scalar operands of that category.
* If a zero-dimension tensor operand has a higher category than dimensioned operands,
we promote to a type with sufficient size and category to hold all zero-dim tensor operands of
that category.
* If there are no higher-category zero-dim operands, we promote to a type with sufficient size
and category to hold all dimensioned operands.
A floating point scalar operand has dtype `oneflow.get_default_dtype()` and an integral
non-boolean scalar operand has dtype `oneflow.int64`. Unlike numpy, we do not inspect
values when determining the minimum `dtypes` of an operand. Quantized and complex types
are not yet supported.
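As a rough illustration of the scalar rule above, the following pure-Python sketch (the names are ours, not a OneFlow API) orders the four categories and lets a scalar operand win only when its category is strictly higher than the tensor operand's:

```python
# Category ordering used by type promotion:
# complex > floating > integral > boolean.
CATEGORY = {"bool": 0, "int": 1, "float": 2, "complex": 3}

def result_category(tensor_cat, scalar_cat):
    # A scalar promotes the result only when its category is strictly higher;
    # otherwise the tensor operand keeps its category (and its size), which is
    # why `int_tensor + 5` stays int32 rather than widening to int64.
    return scalar_cat if CATEGORY[scalar_cat] > CATEGORY[tensor_cat] else tensor_cat

print(result_category("int", "int"))    # "int": the int32 tensor keeps its dtype
print(result_category("int", "float"))  # "float": the float scalar promotes
```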
Promotion Examples::
>>> float_tensor = oneflow.ones(1, dtype=oneflow.float)
>>> double_tensor = oneflow.ones(1, dtype=oneflow.double)
>>> int_tensor = oneflow.ones(1, dtype=oneflow.int)
>>> long_tensor = oneflow.ones(1, dtype=oneflow.long)
>>> uint_tensor = oneflow.ones(1, dtype=oneflow.uint8)
>>> double_tensor = oneflow.ones(1, dtype=oneflow.double)
>>> bool_tensor = oneflow.ones(1, dtype=oneflow.bool)
# zero-dim tensors
>>> long_zerodim = oneflow.tensor(1, dtype=oneflow.long)
>>> int_zerodim = oneflow.tensor(1, dtype=oneflow.int)
>>> a,b=oneflow.tensor(5),oneflow.tensor(5)
>>> oneflow.add(a, b).dtype
oneflow.int64
# 5 is an int64, but does not have higher category than int_tensor so is not considered.
>>> (int_tensor + 5).dtype
oneflow.int32
>>> (int_tensor + long_zerodim).dtype
oneflow.int64
>>> (long_tensor + int_tensor).dtype
oneflow.int64
>>> (bool_tensor + long_tensor).dtype
oneflow.int64
>>> (bool_tensor + uint_tensor).dtype
oneflow.uint8
>>> (float_tensor + double_tensor).dtype
oneflow.float64
>>> (bool_tensor + int_tensor).dtype
oneflow.int32
# Since long is a different kind than float, result dtype only needs to be large enough
# to hold the float.
>>> oneflow.add(long_tensor, float_tensor).dtype
oneflow.float32
When the output tensor of an arithmetic operation is specified, we allow casting to its `dtype` except that:
* An integral output tensor cannot accept a floating point tensor.
* A boolean output tensor cannot accept a non-boolean tensor.
* A non-complex output tensor cannot accept a complex tensor.
Casting Examples::
# allowed:
>>> float_tensor *= float_tensor
>>> float_tensor *= int_tensor
>>> float_tensor *= uint_tensor
>>> float_tensor *= bool_tensor
>>> int_tensor *= uint_tensor
# disallowed (RuntimeError: result type can't be cast to the desired output type):
>>> float_tensor *= double_tensor
>>> int_tensor *= float_tensor
>>> int_tensor *= long_tensor
>>> uint_tensor *= int_tensor
>>> bool_tensor *= int_tensor
>>> bool_tensor *= uint_tensor
.. _device-doc:
oneflow.device
--------------------------------------------------------------
.. autoclass:: oneflow.device
.. class:: device
A :class:`oneflow.device` is an object representing the device on which a :class:`oneflow.Tensor` is
or will be allocated.
The :class:`oneflow.device` contains a device type (``'cpu'`` or ``'cuda'``) and optional device
ordinal for the device type. If the device ordinal is not present, this object will always represent
the current device for the device type, even after :func:`oneflow.cuda.set_device()` is called; e.g.,
a :class:`oneflow.Tensor` constructed with device ``'cuda'`` is equivalent to ``'cuda:X'`` where X is
the result of :func:`oneflow.cuda.current_device()`.
A :class:`oneflow.Tensor`'s device can be accessed via the :attr:`Tensor.device` property.
A :class:`oneflow.device` can be constructed via a string or via a string and device ordinal.
Via a string:
::
>>> oneflow.device('cuda:0')
device(type='cuda', index=0)
>>> oneflow.device('cpu')
device(type='cpu', index=0)
>>> oneflow.device('cuda') # current cuda device
device(type='cuda', index=0)
Via a string and device ordinal:
::
>>> oneflow.device('cuda', 0)
device(type='cuda', index=0)
>>> oneflow.device('cpu', 0)
device(type='cpu', index=0)
.. note::
The :class:`oneflow.device` argument in functions can generally be substituted with a string.
This allows for fast prototyping of code.
>>> # Example of a function that takes in a oneflow.device
>>> cuda1 = oneflow.device('cuda:1')
>>> oneflow.randn((2,3), device=cuda1)
>>> # You can substitute the oneflow.device with a string
>>> oneflow.randn((2,3), device='cuda:1')
.. note::
For legacy reasons, a device can be constructed via a single device ordinal, which is treated
as a cuda device. This matches :meth:`Tensor.get_device`, which returns an ordinal for cuda
tensors and is not supported for cpu tensors.
>>> oneflow.device(1)
device(type='cuda', index=1)
.. note::
Methods which take a device will generally accept a (properly formatted) string
or (legacy) integer device ordinal, i.e. the following are all equivalent:
>>> oneflow.randn((2,3), device=oneflow.device('cuda:1'))
>>> oneflow.randn((2,3), device='cuda:1')
>>> oneflow.randn((2,3), device=1) # legacy
oneflow.placement
--------------------------------------------------------------
.. autoclass:: oneflow.placement
oneflow.placement.all
--------------------------------------------------------------
.. autofunction:: oneflow.placement.all
oneflow.env.all_device_placement
--------------------------------------------------------------
.. autofunction:: oneflow.env.all_device_placement
.. currentmodule:: oneflow
.. _type-info-doc:
Type Info
=========
.. The documentation is referenced from: https://pytorch.org/docs/1.10/type_info.html.
The numerical properties of a :class:`oneflow.dtype` can be accessed through either the :class:`oneflow.finfo` or the :class:`oneflow.iinfo`.
.. contents:: oneflow
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
oneflow.finfo
-------------
.. class:: oneflow.finfo
A :class:`oneflow.finfo` is an object that represents the numerical properties of a floating point :class:`oneflow.dtype` (i.e. ``oneflow.float32``, ``oneflow.float64`` and ``oneflow.float16``). This is similar to `numpy.finfo <https://numpy.org/doc/stable/reference/generated/numpy.finfo.html>`_.
A :class:`oneflow.finfo` provides the following attributes:
================== ======= ==========================================================================
Name               Type    Description
================== ======= ==========================================================================
bits               int     The number of bits occupied by the type.
eps                float   The smallest representable number such that ``1.0 + eps != 1.0``.
min                float   The smallest representable number (typically ``-max``).
max                float   The largest representable number.
tiny               float   The smallest positive normal number. See notes.
resolution         float   The approximate decimal resolution of this type, i.e., ``10**-precision``.
================== ======= ==========================================================================
For example:
.. code-block::
>>> import oneflow as flow
>>> flow.finfo()
finfo(resolution=1e-06, min=-3.40282e+38, max=3.40282e+38, eps=1.19209e-07, tiny=1.17549e-38, dtype=oneflow.float32, bits=32)
>>> flow.finfo(flow.float)
finfo(resolution=1e-06, min=-3.40282e+38, max=3.40282e+38, eps=1.19209e-07, tiny=1.17549e-38, dtype=oneflow.float32, bits=32)
>>> flow.finfo(flow.float16).bits
16
>>> flow.finfo(flow.float16).max
65504.0
oneflow.iinfo
-------------
.. class:: oneflow.iinfo
A :class:`oneflow.iinfo` is an object that represents the numerical properties of an integer :class:`oneflow.dtype` (i.e. ``oneflow.uint8``, ``oneflow.int8``, ``oneflow.int16``, ``oneflow.int32``, and ``oneflow.int64``). This is similar to `numpy.iinfo <https://numpy.org/doc/stable/reference/generated/numpy.iinfo.html>`_.
A :class:`oneflow.iinfo` provides the following attributes:
================== ======= ==========================================================================
Name               Type    Description
================== ======= ==========================================================================
bits               int     The number of bits occupied by the type.
min                int     The smallest representable number.
max                int     The largest representable number.
================== ======= ==========================================================================
For example:
.. code-block::
>>> import oneflow as flow
>>> flow.iinfo(flow.int8)
iinfo(min=-128, max=127, dtype=oneflow.int8, bits=8)
>>> flow.iinfo(flow.int).max
2147483647
>>> flow.iinfo(flow.int).bits
32
oneflow.utils.data
===================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/data.html
.. automodule:: oneflow.utils.data
At the heart of the OneFlow data loading utility is the :class:`oneflow.utils.data.DataLoader`
class. It represents a Python iterable over a dataset, with support for
* `map-style and iterable-style datasets <Dataset Types_>`_,
* `customizing data loading order <Data Loading Order and Sampler_>`_,
* `automatic batching <Loading Batched and Non-Batched Data_>`_,
* `single- and multi-process data loading <Single- and Multi-process Data Loading_>`_,
* `automatic memory pinning <Memory Pinning_>`_.
These options are configured by the constructor arguments of a
:class:`~oneflow.utils.data.DataLoader`, which has signature::
DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
batch_sampler=None, num_workers=0, collate_fn=None,
pin_memory=False, drop_last=False, timeout=0,
worker_init_fn=None, *, prefetch_factor=2,
persistent_workers=False)
The sections below describe in detail the effects and usages of these options.
Dataset Types
-------------
The most important argument of :class:`~oneflow.utils.data.DataLoader`
constructor is :attr:`dataset`, which indicates a dataset object to load data
from. Oneflow supports two different types of datasets:
* `map-style datasets <Map-style datasets_>`_,
* `iterable-style datasets <Iterable-style datasets_>`_.
Map-style datasets
^^^^^^^^^^^^^^^^^^
A map-style dataset is one that implements the :meth:`__getitem__` and
:meth:`__len__` protocols, and represents a map from (possibly non-integral)
indices/keys to data samples.
For example, such a dataset, when accessed with ``dataset[idx]``, could read
the ``idx``-th image and its corresponding label from a folder on the disk.
See :class:`~oneflow.utils.data.Dataset` for more details.
Iterable-style datasets
^^^^^^^^^^^^^^^^^^^^^^^
An iterable-style dataset is an instance of a subclass of :class:`~oneflow.utils.data.IterableDataset`
that implements the :meth:`__iter__` protocol, and represents an iterable over
data samples. This type of dataset is particularly suitable for cases where
random reads are expensive or even improbable, and where the batch size depends
on the fetched data.
For example, such a dataset, when ``iter(dataset)`` is called, could return a
stream of data reading from a database, a remote server, or even logs generated
in real time.
See :class:`~oneflow.utils.data.IterableDataset` for more details.
.. note:: When using an :class:`~oneflow.utils.data.IterableDataset` with
`multi-process data loading <Multi-process data loading_>`_, the same
dataset object is replicated on each worker process, and thus the
replicas must be configured differently to avoid duplicated data. See
:class:`~oneflow.utils.data.IterableDataset` documentations for how to
achieve this.
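One common way to configure the replicas differently is to give each worker its own shard of the sample space. The helper below is a pure-Python sketch; in a real setup the ``worker_id`` and ``num_workers`` values would come from the data loader's worker info, and the names here are illustrative, not OneFlow API.

```python
# Hypothetical sketch: split a half-open index range [start, end) into
# contiguous, non-overlapping per-worker shards, so replicated
# iterable-style dataset copies do not emit duplicated samples.
def shard_range(start, end, worker_id, num_workers):
    """Return the slice of [start, end) assigned to `worker_id`."""
    per_worker = (end - start + num_workers - 1) // num_workers  # ceil division
    lo = start + worker_id * per_worker
    hi = min(lo + per_worker, end)
    return range(lo, hi)

# Two workers over [0, 10): worker 0 gets 0..4, worker 1 gets 5..9.
parts = [list(shard_range(0, 10, w, 2)) for w in range(2)]
```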
Data Loading Order and :class:`~oneflow.utils.data.Sampler`
-----------------------------------------------------------
For `iterable-style datasets <Iterable-style datasets_>`_, data loading order
is entirely controlled by the user-defined iterable. This allows easier
implementations of chunk-reading and dynamic batch size (e.g., by yielding a
batched sample at each time).
The rest of this section concerns the case with
`map-style datasets <Map-style datasets_>`_. :class:`oneflow.utils.data.Sampler`
classes are used to specify the sequence of indices/keys used in data loading.
They represent iterable objects over the indices to datasets. E.g., in the
common case with stochastic gradient descent (SGD), a
:class:`~oneflow.utils.data.Sampler` could randomly permute a list of indices
and yield each one at a time, or yield a small number of them for mini-batch
SGD.
A sequential or shuffled sampler will be automatically constructed based on the :attr:`shuffle` argument to a :class:`~oneflow.utils.data.DataLoader`.
Alternatively, users may use the :attr:`sampler` argument to specify a
custom :class:`~oneflow.utils.data.Sampler` object that at each time yields
the next index/key to fetch.
A custom :class:`~oneflow.utils.data.Sampler` that yields a list of batch
indices at a time can be passed as the :attr:`batch_sampler` argument.
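The relationship between a sampler and a batch sampler can be sketched in a few lines of pure Python (illustrative stand-ins, not the OneFlow classes themselves): a sampler yields one index at a time, and a batch sampler groups those indices into lists.

```python
# Illustrative sketch: a sampler is any iterable of indices; a batch sampler
# wraps it and yields lists of indices, honoring batch_size and drop_last.
def sequential_sampler(n):
    yield from range(n)

def batch_sampler(sampler, batch_size, drop_last):
    batch = []
    for idx in sampler:
        batch.append(idx)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch and not drop_last:
        yield batch  # final, possibly smaller batch

batches = list(batch_sampler(sequential_sampler(7), batch_size=3, drop_last=False))
```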
Automatic batching can also be enabled via :attr:`batch_size` and
:attr:`drop_last` arguments. See
`the next section <Loading Batched and Non-Batched Data_>`_ for more details
on this.
.. note::
Neither :attr:`sampler` nor :attr:`batch_sampler` is compatible with
iterable-style datasets, since such datasets have no notion of a key or an
index.
Loading Batched and Non-Batched Data
------------------------------------
:class:`~oneflow.utils.data.DataLoader` supports automatically collating
individual fetched data samples into batches via arguments
:attr:`batch_size`, :attr:`drop_last`, :attr:`batch_sampler`, and
:attr:`collate_fn` (which has a default function).
Automatic batching (default)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This is the most common case, and corresponds to fetching a minibatch of
data and collating them into batched samples, i.e., containing Tensors with
one dimension being the batch dimension (usually the first).
When :attr:`batch_size` (default ``1``) is not ``None``, the data loader yields
batched samples instead of individual samples. :attr:`batch_size` and
:attr:`drop_last` arguments are used to specify how the data loader obtains
batches of dataset keys. For map-style datasets, users can alternatively
specify :attr:`batch_sampler`, which yields a list of keys at a time.
.. note::
The :attr:`batch_size` and :attr:`drop_last` arguments essentially are used
to construct a :attr:`batch_sampler` from :attr:`sampler`. For map-style
datasets, the :attr:`sampler` is either provided by user or constructed
based on the :attr:`shuffle` argument. For iterable-style datasets, the
:attr:`sampler` is a dummy infinite one. See
`this section <Data Loading Order and Sampler_>`_ on more details on
samplers.
.. note::
When fetching from
`iterable-style datasets <Iterable-style datasets_>`_ with
`multi-processing <Multi-process data loading_>`_, the :attr:`drop_last`
argument drops the last non-full batch of each worker's dataset replica.
After fetching a list of samples using the indices from sampler, the function
passed as the :attr:`collate_fn` argument is used to collate lists of samples
into batches.
In this case, loading from a map-style dataset is roughly equivalent with::
for indices in batch_sampler:
yield collate_fn([dataset[i] for i in indices])
and loading from an iterable-style dataset is roughly equivalent with::
dataset_iter = iter(dataset)
for indices in batch_sampler:
yield collate_fn([next(dataset_iter) for _ in indices])
A custom :attr:`collate_fn` can be used to customize collation, e.g., padding
sequential data to max length of a batch. See
`this section <dataloader-collate_fn_>`_ on more about :attr:`collate_fn`.
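The padding use case mentioned above can be sketched with a hypothetical custom :attr:`collate_fn` (pure Python; plain lists stand in for tensors, and a real implementation would build a ``oneflow.Tensor`` from the padded rows):

```python
# Hypothetical collate_fn: pad variable-length samples in a batch to the
# length of the longest sample, using `pad_value` as filler.
def pad_collate(batch, pad_value=0):
    max_len = max(len(sample) for sample in batch)
    return [sample + [pad_value] * (max_len - len(sample)) for sample in batch]

padded = pad_collate([[1, 2], [3, 4, 5], [6]])
```

A function like this would be passed as ``DataLoader(dataset, collate_fn=pad_collate, ...)`` so that each yielded batch is rectangular.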
Disable automatic batching
^^^^^^^^^^^^^^^^^^^^^^^^^^
In certain cases, users may want to handle batching manually in dataset code,
or simply load individual samples. For example, it could be cheaper to directly
load batched data (e.g., bulk reads from a database or reading continuous
chunks of memory), or the batch size is data dependent, or the program is
designed to work on individual samples. Under these scenarios, it's likely
better to not use automatic batching (where :attr:`collate_fn` is used to
collate the samples), but let the data loader directly return each member of
the :attr:`dataset` object.
When both :attr:`batch_size` and :attr:`batch_sampler` are ``None`` (default
value for :attr:`batch_sampler` is already ``None``), automatic batching is
disabled. Each sample obtained from the :attr:`dataset` is processed with the
function passed as the :attr:`collate_fn` argument.
**When automatic batching is disabled**, the default :attr:`collate_fn` simply
converts NumPy arrays into OneFlow Tensors, and keeps everything else untouched.
In this case, loading from a map-style dataset is roughly equivalent to::

    for index in sampler:
        yield collate_fn(dataset[index])

and loading from an iterable-style dataset is roughly equivalent to::

    for data in iter(dataset):
        yield collate_fn(data)
See `this section <dataloader-collate_fn_>`_ for more details on :attr:`collate_fn`.
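A framework-free sketch of the disabled-batching scenario described above: the dataset itself yields ready-made batches (e.g., bulk reads from a database), and each item passes through :attr:`collate_fn` unchanged. The generator below is a hypothetical stand-in for a real iterable-style dataset.

```python
# Stand-in for an iterable-style dataset whose items are already batches.
def bulk_read_dataset():
    yield [0, 1, 2]  # one pre-batched chunk per "read"
    yield [3, 4]

def identity_collate(sample):
    # the default collate_fn would only convert NumPy arrays to tensors here
    return sample

batches = [identity_collate(chunk) for chunk in bulk_read_dataset()]
print(batches)  # [[0, 1, 2], [3, 4]]
```

With a real :class:`~oneflow.utils.data.DataLoader` you would get the same effect by passing ``batch_size=None``.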
.. _dataloader-collate_fn:
Working with :attr:`collate_fn`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The use of :attr:`collate_fn` differs slightly depending on whether automatic
batching is enabled or disabled.
**When automatic batching is disabled**, :attr:`collate_fn` is called with
each individual data sample, and the output is yielded from the data loader
iterator. In this case, the default :attr:`collate_fn` simply converts NumPy
arrays into OneFlow Tensors.
**When automatic batching is enabled**, :attr:`collate_fn` is called with a list
of data samples each time. It is expected to collate the input samples into
a batch for yielding from the data loader iterator. The rest of this section
describes the behavior of the default :attr:`collate_fn`
(:func:`~oneflow.utils.data.default_collate`).
For instance, if each data sample consists of a 3-channel image and an integral
class label, i.e., each element of the dataset returns a tuple
``(image, class_index)``, the default :attr:`collate_fn` collates a list of
such tuples into a single tuple of a batched image tensor and a batched class
label Tensor. In particular, the default :attr:`collate_fn` has the following
properties:
* It always prepends a new dimension as the batch dimension.
* It automatically converts NumPy arrays and Python numerical values into
OneFlow Tensors.
* It preserves the data structure, e.g., if each sample is a dictionary, it
outputs a dictionary with the same set of keys but batched Tensors as values
(or lists if the values can not be converted into Tensors). Same
for ``list`` s, ``tuple`` s, ``namedtuple`` s, etc.
Users may use customized :attr:`collate_fn` to achieve custom batching, e.g.,
collating along a dimension other than the first, padding sequences of
various lengths, or adding support for custom data types.
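The structure-preserving property listed above can be sketched without any framework: a list of dict samples collates into one dict keyed the same way. The values are gathered into plain lists here, whereas :func:`~oneflow.utils.data.default_collate` would stack numeric values into batched OneFlow Tensors.

```python
# Hypothetical collate_dicts: preserves the dict structure of the samples
# while batching the values under each key.
def collate_dicts(batch):
    return {key: [sample[key] for sample in batch] for key in batch[0]}

samples = [{"x": 1, "y": 10}, {"x": 2, "y": 20}]
batched = collate_dicts(samples)
print(batched)  # {'x': [1, 2], 'y': [10, 20]}
```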
If you run into a situation where the outputs of :class:`~oneflow.utils.data.DataLoader`
have dimensions or types that differ from your expectations, you may
want to check your :attr:`collate_fn`.
Single- and Multi-process Data Loading
--------------------------------------
A :class:`~oneflow.utils.data.DataLoader` uses single-process data loading by
default.
Within a Python process, the
`Global Interpreter Lock (GIL) <https://wiki.python.org/moin/GlobalInterpreterLock>`_
prevents fully parallelizing Python code across threads. To avoid blocking
computation code with data loading, OneFlow provides an easy switch to perform
multi-process data loading by simply setting the argument :attr:`num_workers`
to a positive integer.
Single-process data loading (default)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In this mode, data fetching is done in the same process in which the
:class:`~oneflow.utils.data.DataLoader` is initialized. Therefore, data loading
may block computing. However, this mode may be preferred when resource(s) used
for sharing data among processes (e.g., shared memory, file descriptors) is
limited, or when the entire dataset is small and can be loaded entirely in
memory. Additionally, single-process loading often shows more readable error
traces and thus is useful for debugging.
Multi-process data loading
^^^^^^^^^^^^^^^^^^^^^^^^^^
Setting the argument :attr:`num_workers` as a positive integer will
turn on multi-process data loading with the specified number of loader worker
processes.
.. warning::
After several iterations, the loader worker processes will consume
the same amount of CPU memory as the parent process for all Python
objects in the parent process which are accessed from the worker
processes. This can be problematic if the Dataset contains a lot of
data (e.g., you are loading a very large list of filenames at Dataset
construction time) and/or you are using a lot of workers (overall
memory usage is ``number of workers * size of parent process``). The
simplest workaround is to replace Python objects with non-refcounted
representations such as Pandas, NumPy, or PyArrow objects.
In this mode, each time an iterator of a :class:`~oneflow.utils.data.DataLoader`
is created (e.g., when you call ``enumerate(dataloader)``), :attr:`num_workers`
worker processes are created. At this point, the :attr:`dataset`,
:attr:`collate_fn`, and :attr:`worker_init_fn` are passed to each
worker, where they are used to initialize, and fetch data. This means that
dataset access together with its internal IO, transforms
(including :attr:`collate_fn`) runs in the worker process.
For map-style datasets, the main process generates the indices using
:attr:`sampler` and sends them to the workers. So any shuffle randomization is
done in the main process which guides loading by assigning indices to load.
For iterable-style datasets, since each worker process gets a replica of the
:attr:`dataset` object, naive multi-process loading will often result in
duplicated data. Using :attr:`worker_init_fn`, users may configure each replica independently. (See
:class:`~oneflow.utils.data.IterableDataset` documentation for how to achieve
this.) For similar reasons, in multi-process loading, the :attr:`drop_last`
argument drops the last non-full batch of each worker's iterable-style dataset
replica.
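The de-duplication idea described above can be sketched without any framework: each worker replica keeps every ``num_workers``-th element starting at its own ``worker_id``, which is the same scheme a :attr:`worker_init_fn` would implement using the worker information OneFlow exposes.

```python
# Hypothetical shard_for_worker: partitions one logical stream so that
# worker replicas produce disjoint, non-duplicated data.
def shard_for_worker(stream, worker_id, num_workers):
    return [item for i, item in enumerate(stream) if i % num_workers == worker_id]

stream = list(range(10))
print(shard_for_worker(stream, 0, 2))  # [0, 2, 4, 6, 8]
print(shard_for_worker(stream, 1, 2))  # [1, 3, 5, 7, 9]
```

Together the two shards cover the stream exactly once, which is the property naive replication lacks.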
Workers are shut down once the end of the iteration is reached, or when the
iterator becomes garbage collected.
.. warning::
It is generally not recommended to return CUDA tensors in multi-process
loading because of many subtleties in using CUDA and sharing CUDA tensors in
multiprocessing. Instead, we recommend
using `automatic memory pinning <Memory Pinning_>`_ (i.e., setting
:attr:`pin_memory=True`), which enables fast data transfer to CUDA-enabled
GPUs.
Platform-specific behaviors
"""""""""""""""""""""""""""
Since workers rely on Python :py:mod:`multiprocessing`, worker launch behavior is
different on Windows compared to Unix.
* On Unix, :func:`fork` is the default :py:mod:`multiprocessing` start method.
Using :func:`fork`, child workers typically can access the :attr:`dataset` and
Python argument functions directly through the cloned address space.
* On Windows or macOS, :func:`spawn` is the default :py:mod:`multiprocessing` start method.
  Using :func:`spawn`, another interpreter is launched which runs your main script,
followed by the internal worker function that receives the :attr:`dataset`,
:attr:`collate_fn` and other arguments through :py:mod:`pickle` serialization.
This separate serialization means that you should take two steps to ensure you
are compatible with Windows while using multi-process data loading:
- Wrap most of your main script's code within an ``if __name__ == '__main__':`` block,
  to make sure it doesn't run again (most likely generating an error) when each worker
  process is launched. You can place your dataset and :class:`~oneflow.utils.data.DataLoader`
  instance creation logic here, as it doesn't need to be re-executed in workers.
- Make sure that any custom :attr:`collate_fn`, :attr:`worker_init_fn`
or :attr:`dataset` code is declared as top level definitions, outside of the
``__main__`` check. This ensures that they are available in worker processes.
  (This is needed since functions are pickled as references only, not as bytecode.)
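The two steps above can be sketched as a spawn-safe script layout: the callables workers need are defined at top level (picklable by reference), and loader construction is guarded so that re-importing the script in a spawned worker does not re-run it. The commented-out DataLoader call is illustrative only.

```python
# Top-level definitions: visible to spawned worker processes via pickle.
def my_collate(batch):
    return list(batch)

def my_worker_init(worker_id):
    pass  # per-worker setup would go here

if __name__ == "__main__":
    # Dataset and DataLoader creation belongs here; it must not re-run
    # when a worker process re-imports this script.
    # loader = oneflow.utils.data.DataLoader(dataset, num_workers=2,
    #                                        collate_fn=my_collate,
    #                                        worker_init_fn=my_worker_init)
    pass
```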
.. _data-loading-randomness:
Randomness in multi-process data loading
""""""""""""""""""""""""""""""""""""""""""
By default, each worker will have its OneFlow seed set to ``base_seed + worker_id``,
where ``base_seed`` is a long generated by the main process using its RNG (thereby
consuming an RNG state) or by a specified :attr:`generator`. However, seeds for other
libraries may be duplicated upon initializing workers, causing each worker to return
identical random numbers.
In :attr:`worker_init_fn`, you may access the OneFlow seed set for each worker
with :func:`oneflow.initial_seed()`, and use it to seed other libraries before data
loading.
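A hedged sketch of re-seeding another library inside :attr:`worker_init_fn` so that workers do not all draw identical numbers. ``BASE_SEED`` is a stand-in: inside a real worker you would derive the seed from ``oneflow.initial_seed()`` instead of a fixed constant.

```python
import random

BASE_SEED = 1234  # stand-in for oneflow.initial_seed()

def worker_init_fn(worker_id):
    # seed Python's random module per worker so streams are distinct
    # across workers yet reproducible for a given worker_id
    random.seed(BASE_SEED + worker_id)

worker_init_fn(0)
first = random.random()
worker_init_fn(0)
assert random.random() == first  # same worker seed -> same stream
```

The same pattern applies to NumPy or any other RNG your :attr:`dataset` code relies on.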
Memory Pinning
--------------
Host to GPU copies are much faster when they originate from pinned (page-locked)
memory. See `cuda-memory-pinning` for more details on when and how to use
pinned memory generally.
For data loading, passing :attr:`pin_memory=True` to a
:class:`~oneflow.utils.data.DataLoader` will automatically put the fetched data
Tensors in pinned memory, and thus enables faster data transfer to CUDA-enabled
GPUs.
The default memory pinning logic only recognizes Tensors and maps and iterables
containing Tensors. By default, if the pinning logic sees a batch that is a
custom type (which will occur if you have a :attr:`collate_fn` that returns a
custom batch type), or if each element of your batch is a custom type, the
pinning logic will not recognize them, and it will return that batch (or those
elements) without pinning the memory. To enable memory pinning for custom
batch or data type(s), define a :meth:`pin_memory` method on your custom
type(s).
See the example below.
Example::

    class SimpleCustomBatch:
        def __init__(self, data):
            transposed_data = list(zip(*data))
            self.inp = oneflow.stack(transposed_data[0], 0)
            self.tgt = oneflow.stack(transposed_data[1], 0)

        # custom memory pinning method on custom type
        def pin_memory(self):
            self.inp = self.inp.pin_memory()
            self.tgt = self.tgt.pin_memory()
            return self

    def collate_wrapper(batch):
        return SimpleCustomBatch(batch)

    inps = oneflow.arange(10 * 5, dtype=oneflow.float32).view(10, 5)
    tgts = oneflow.arange(10 * 5, dtype=oneflow.float32).view(10, 5)
    dataset = TensorDataset(inps, tgts)

    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_wrapper,
                        pin_memory=True)

    for batch_ndx, sample in enumerate(loader):
        print(sample.inp.is_pinned())
        print(sample.tgt.is_pinned())
.. autoclass:: DataLoader
.. autoclass:: Dataset
.. autoclass:: IterableDataset
.. autoclass:: TensorDataset
.. autoclass:: ConcatDataset
.. autoclass:: Subset
.. autofunction:: oneflow.utils.data.random_split
.. autoclass:: oneflow.utils.data.Sampler
.. autoclass:: oneflow.utils.data.SequentialSampler
.. autoclass:: oneflow.utils.data.RandomSampler
.. autoclass:: oneflow.utils.data.SubsetRandomSampler
.. autoclass:: oneflow.utils.data.BatchSampler
.. autoclass:: oneflow.utils.data.distributed.DistributedSampler
oneflow.utils.global_view
======================================
Global view operations
--------------------------------------
.. currentmodule:: oneflow.utils.global_view
.. autosummary::
:toctree: generated
:nosignatures:
to_global
to_local
global_mode
current_global_mode
oneflow.utils
===================================
Utils
----------------------------------
.. currentmodule:: oneflow.utils
.. automodule:: oneflow.utils.data
:members: DataLoader,
Dataset,
IterableDataset,
TensorDataset,
ConcatDataset,
Subset,
random_split,
Sampler,
SequentialSampler,
RandomSampler,
SubsetRandomSampler,
BatchSampler
.. currentmodule:: oneflow.utils
.. automodule:: oneflow.utils.data.distributed
:members: DistributedSampler
.. autofunction:: oneflow.utils.from_torch
.. autofunction:: oneflow.utils.to_torch
oneflow.utils.tensor
==========================================================
Ops for converting tensors between OneFlow and PyTorch.
----------------------------------------------------------
.. currentmodule:: oneflow.utils.tensor
.. autosummary::
:toctree: generated
:nosignatures:
from_torch
to_torch
......@@ -15,4 +15,5 @@ add_subdirectory(kineto)
list(APPEND EXTERNAL_TARGETS kineto)
mark_targets_as_system(${EXTERNAL_TARGETS})
set_property(GLOBAL PROPERTY EXTERNAL_TARGETS ${EXTERNAL_TARGETS})
......@@ -34,7 +34,9 @@ list(
$ENV{CUPTI_ROOT}/lib
/usr/lib
${CUDA_SOURCE_DIR}/targets/x86_64-linux/lib64
${CUDA_SOURCE_DIR}/extras/CUPTI/lib64)
${CUDA_SOURCE_DIR}/targets/x86_64-linux/lib
${CUDA_SOURCE_DIR}/extras/CUPTI/lib64
${CUDA_SOURCE_DIR}/extras/CUPTI/lib)
find_library(
CUDA_cupti_LIBRARY
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_API_COMMON_OFBLOB_H_
#define ONEFLOW_API_COMMON_OFBLOB_H_
#include "oneflow/core/common/just.h"
#include "oneflow/core/register/ofblob.h"
namespace oneflow {
template<typename T>
struct BlobBufferCopyUtil {
static Maybe<void> From(uint64_t of_blob_ptr, const T* buf_ptr, size_t size) {
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
of_blob->AutoMemCopyFrom<T>(buf_ptr, size);
return Maybe<void>::Ok();
}
static Maybe<void> To(uint64_t of_blob_ptr, T* buf_ptr, size_t size) {
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
of_blob->AutoMemCopyTo<T>(buf_ptr, size);
return Maybe<void>::Ok();
}
};
template<>
struct BlobBufferCopyUtil<void> {
static Maybe<void> From(uint64_t of_blob_ptr, const void* buf_ptr, size_t size) {
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
of_blob->AutoMemCopyFrom<void>(buf_ptr, size);
return Maybe<void>::Ok();
}
static Maybe<void> To(uint64_t of_blob_ptr, void* buf_ptr, size_t size) {
auto* of_blob = reinterpret_cast<OfBlob*>(of_blob_ptr);
of_blob->AutoMemCopyTo<void>(buf_ptr, size);
return Maybe<void>::Ok();
}
};
} // namespace oneflow
#endif // !ONEFLOW_API_COMMON_OFBLOB_H_
......@@ -26,7 +26,9 @@ namespace oneflow {
namespace api {
inline Maybe<std::string> SbpToString(Symbol<SbpParallel> sbp_sym) {
// NOTE: The api interface will print the whole name of sbp.
inline Maybe<std::string> ApiSbpToString(Symbol<SbpParallel> sbp_sym) {
std::string sbp_str = "oneflow.sbp.";
if (sbp_sym->has_broadcast_parallel()) {
sbp_str += "broadcast";
......@@ -40,11 +42,11 @@ inline Maybe<std::string> SbpToString(Symbol<SbpParallel> sbp_sym) {
return sbp_str;
}
inline Maybe<std::string> NdSbpToString(Symbol<NdSbp> nd_sbp) {
inline Maybe<std::string> ApiNdSbpToString(Symbol<NdSbp> nd_sbp) {
std::string str = "(";
for (int i = 0; i < nd_sbp->sbp_parallel_size(); ++i) {
if (i > 0) { str += ", "; }
str += *JUST(SbpToString(SymbolOf(nd_sbp->sbp_parallel(i))));
str += *JUST(ApiSbpToString(SymbolOf(nd_sbp->sbp_parallel(i))));
}
if (nd_sbp->sbp_parallel_size() == 1) { str += ","; }
str += ")";
......