Commit a715222c authored by yuguo's avatar yuguo
Browse files

0.9.1-rocm

parent f262efc9
......@@ -45,7 +45,7 @@ file(
"${PROJECT_SOURCE_DIR}/oneflow/user/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/api/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/maybe/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/extension/python/*.*")
"${PROJECT_SOURCE_DIR}/oneflow/extension/*.*")
foreach(oneflow_single_file ${oneflow_all_src})
# Verify whether this file is for other platforms
......@@ -80,6 +80,21 @@ foreach(oneflow_single_file ${oneflow_all_src})
if(BUILD_CUDA)
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
if(BUILD_ROCM)
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cu$")
get_filename_component(oneflow_single_file_hip_cpp_dir ${oneflow_single_file} DIRECTORY)
get_filename_component(oneflow_single_file_hip_cpp ${oneflow_single_file} NAME_WE)
add_custom_command(
OUTPUT "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
COMMAND ${CMAKE_COMMAND} -E copy "${oneflow_single_file}" "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
DEPENDS "${oneflow_single_file}"
)
list(APPEND of_all_obj_cc ${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp)
endif()
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cuh$")
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
endif()
set(group_this ON)
endif()
......@@ -96,8 +111,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
set(group_this ON)
endif()
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/extension/python/.*\\.(h|cpp)$")
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/extension/.*\\.(c|h|cpp)$")
list(APPEND of_pyext_obj_cc ${oneflow_single_file})
set(group_this ON)
endif()
......@@ -105,7 +119,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*\\.cpp$")
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*_test\\.cpp$")
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe|thread)/.*_test\\.cpp$")
# test file
list(APPEND of_all_test_cc ${oneflow_single_file})
elseif(APPLE AND "${oneflow_single_file}" MATCHES
......@@ -136,6 +150,7 @@ add_custom_target(
of_format
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${CMAKE_CURRENT_SOURCE_DIR}/oneflow --fix
--exclude="oneflow/user/kernels/fmha_flash_attention"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${ONEFLOW_PYTHON_DIR} --fix --exclude="oneflow/include" --exclude="oneflow/core"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_clang_format.py --source_dir
......@@ -254,20 +269,22 @@ if("${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/32805e60c9de1f82887cd2af30d247dcabd2e1d3.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "f2f17229cf21049663b8ef4f2b6b8062"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "6b7c6506d5922de9632c8ff012b2f945"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e0ea669a9f0872d35bffda5ec6c5ac6f"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "241a333828bba1efa35aff4c4fc2ce87"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "075fbfdf06cb3f02373ea44971af7b03"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2")
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "334997b4879aba15d9323a732356cf2a")
unset(LLVM_MONO_REPO_URL CACHE)
unset(LLVM_MONO_REPO_MD5 CACHE)
endif()
set(LLVM_MONO_REPO_URL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
set(LLVM_MONO_REPO_URL "https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-15.0.6.zip"
CACHE STRING "")
use_mirror(VARIABLE LLVM_MONO_REPO_URL URL ${LLVM_MONO_REPO_URL})
set(LLVM_MONO_REPO_MD5 "334997b4879aba15d9323a732356cf2a" CACHE STRING "")
set(LLVM_MONO_REPO_MD5 "1ccc00accc87a1a5d42a275d6e31cd8c" CACHE STRING "")
set(ONEFLOW_BUILD_ROOT_DIR "${PROJECT_BINARY_DIR}")
add_subdirectory(${PROJECT_SOURCE_DIR}/oneflow/ir)
if(WITH_MLIR)
......@@ -306,9 +323,9 @@ elseif(UNIX)
${oneflow_third_party_libs}
${EXTERNAL_TARGETS}
-Wl,--no-whole-archive
-Wl,--as-needed
-ldl
-lrt
-Wl,--version-script ${PROJECT_SOURCE_DIR}/version_script.lds)
-lrt)
if(BUILD_CUDA)
target_link_libraries(oneflow CUDA::cudart_static)
endif()
......@@ -317,6 +334,43 @@ elseif(WIN32)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /WHOLEARCHIVE:oneflow")
endif()
if (BUILD_ROCM)
# AMD compiler fails to compile these three files with '-O1/2/3'.
# The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
# so '-O0' will override '-O1/2/3'.
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel_hip.cpp
#${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math1_hip.cpp
PROPERTIES COMPILE_OPTIONS "-O0")
endif()
if(BUILD_CUDA)
string(JOIN "," CUDA_REAL_ARCHS ${CUDA_REAL_ARCHS_LIST})
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/core/hardware/cuda_device_descriptor.cpp
PROPERTIES COMPILE_FLAGS "-DCUDA_REAL_ARCHS=\"${CUDA_REAL_ARCHS}\"")
endif()
if(BUILD_CUDA AND WITH_CUTLASS)
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
add_definitions(-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1)
endif()
set_property(
SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND PROPERTY INCLUDE_DIRECTORIES
${CUTLASS_INSTALL_DIR}/examples/41_fused_multi_head_attention)
set_property(SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_glu_kernel.cu APPEND
PROPERTY INCLUDE_DIRECTORIES ${CUTLASS_INSTALL_DIR}/examples/45_dual_gemm)
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA")
set_property(
SOURCE
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND
PROPERTY COMPILE_OPTIONS "--use_fast_math")
endif()
endif()
# oneflow api common
if(BUILD_PYTHON OR BUILD_CPP_API)
file(GLOB_RECURSE of_api_common_files ${PROJECT_SOURCE_DIR}/oneflow/api/common/*.h
......@@ -343,6 +397,8 @@ if(BUILD_PYTHON)
add_dependencies(of_pyext_obj oneflow)
pybind11_add_module(oneflow_internal ${PYBIND11_SRCS} ${of_pybind_obj_cc} ${PYBIND_REGISTRY_CC})
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cublas/lib")
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cudnn/lib")
set_compile_options_to_oneflow_target(oneflow_internal)
set_property(TARGET oneflow_internal PROPERTY CXX_VISIBILITY_PRESET "default")
add_dependencies(oneflow_internal of_functional_obj of_functional_tensor_obj of_op_schema)
......@@ -419,6 +475,9 @@ if(BUILD_TESTING)
oneflow_add_test(oneflow_testexe SRCS ${of_all_test_cc} TEST_NAME oneflow_test)
target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs} glog::glog
${oneflow_test_libs})
if(WITH_MLIR)
target_link_libraries(oneflow_testexe MLIROneFlowExtension)
endif()
endif()
if(BUILD_CPP_API)
......@@ -524,6 +583,10 @@ if(BUILD_CPP_API)
if(BUILD_CUDA)
checkdirandappendslash(DIR ${NCCL_LIBRARY_DIR} OUTPUT NCCL_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${NCCL_LIBRARY_DIR_APPENDED})
if(WITH_CUTLASS)
checkdirandappendslash(DIR ${CUTLASS_LIBRARY_DIR} OUTPUT CUTLASS_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${CUTLASS_LIBRARY_DIR_APPENDED})
endif()
endif()
install(
......@@ -555,6 +618,7 @@ if(BUILD_CPP_API)
llvm-PerfectShuffle
llvm-tblgen
mlir-tblgen
mlir-pdll
obj2yaml
oneflow_tblgen
yaml-bench
......
......@@ -38,7 +38,13 @@ set(ONEFLOW_OP_GROUPS
"TRIGONOMETRIC"
"UNARY"
"UPSAMPLE"
"ONE_EMBEDDING")
"ONE_EMBEDDING"
"LINEAR_ALGEBRA"
"SYSTEM")
if(WITH_MLIR)
list(APPEND ONEFLOW_OP_GROUPS "MLIR_JIT")
endif(WITH_MLIR)
foreach(OP_GROUP_NAME IN LISTS ONEFLOW_OP_GROUPS)
list(APPEND ONEFLOW_SCHEMA_TABLEGEN_FLAGS "-DGET_ONEFLOW_${OP_GROUP_NAME}_OP_DEFINITIONS")
endforeach()
......
......@@ -137,7 +137,8 @@ endif()
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${RE2_INCLUDE_DIR})
if(BUILD_CUDA)
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
# Always use third_party/cub for Clang CUDA in case of compatibility issues
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" AND CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
if(CMAKE_CXX_STANDARD LESS 14)
add_definitions(-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)
add_definitions(-DCUB_IGNORE_DEPRECATED_CPP11)
......@@ -150,6 +151,7 @@ if(BUILD_CUDA)
list(APPEND oneflow_third_party_dependencies cub_copy_headers_to_destination)
endif()
include(nccl)
include(cutlass)
list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES})
list(APPEND oneflow_third_party_libs ${CUDNN_LIBRARIES})
......@@ -159,12 +161,19 @@ if(BUILD_CUDA)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUDNN_INCLUDE_DIRS} ${CUB_INCLUDE_DIR}
${NCCL_INCLUDE_DIR})
endif()
if(WITH_CUTLASS)
list(APPEND oneflow_third_party_dependencies cutlass)
list(APPEND oneflow_third_party_dependencies cutlass_copy_examples_to_destination)
list(APPEND oneflow_third_party_libs ${CUTLASS_LIBRARIES})
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUTLASS_INCLUDE_DIR})
endif()
endif()
if (BUILD_ROCM)
# Find rocm packages
find_package(hip)
find_package(hipfft)
find_package(hipblas)
find_package(hipcub)
find_package(hiprand)
......@@ -173,11 +182,31 @@ if (BUILD_ROCM)
find_package(rccl)
add_definitions(-DWITH_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
add_definitions(-D__HIPCC__)
if (BUILD_ROCM_GRAPHS)
add_definitions(-DWITH_ROCM_GRAPHS)
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-macro-redefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-inconsistent-missing-override")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-exceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-negative")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-overflow")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-duplicate-decl-specifier")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-implicit-int-float-conversion")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pass-failed")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-gpu-rdc")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs hip::hipfft)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
list(APPEND oneflow_third_party_libs roc::rocrand)
......@@ -185,17 +214,18 @@ if (BUILD_ROCM)
list(APPEND oneflow_third_party_libs MIOpen)
link_directories(${ROCM_PATH}/rccl/lib)
list(APPEND oneflow_third_party_libs rccl)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${HIP_INCLUDE_DIRS}
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${HIP_INCLUDE_DIRS}
${HIPFFT_INCLUDE_DIRS}
${HIPBLAS_INCLUDE_DIRS}
${HIPCUB_INCLUDE_DIRS}
"${ROCM_PATH}/hiprand/include"
"${ROCM_PATH}/rocrand/include"
"${ROCM_PATH}/roctracer/include"
${MIOPEN_INCLUDE_DIRS}
${RCCL_INCLUDE_DIRS})
message(STATUS "ONEFLOW_THIRD_PARTY_INCLUDE_DIRS: ${ONEFLOW_THIRD_PARTY_INCLUDE_DIRS}")
endif()
if(BUILD_RDMA)
if(UNIX)
include(CheckIncludeFiles)
......
include(ExternalProject)

# CUTLASS is known not to build when the host compiler is Clang, so default
# it off in that case; the user can still force it via the cache.
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  set(WITH_CUTLASS_INIT OFF)
else()
  set(WITH_CUTLASS_INIT ON)
endif()
set(WITH_CUTLASS ${WITH_CUTLASS_INIT} CACHE BOOL "")

if(WITH_CUTLASS)
  add_definitions(-DWITH_CUTLASS)
  find_package(Threads)

  # Install layout of the external CUTLASS build.
  set(CUTLASS_PROJECT cutlass)
  set(CUTLASS_INSTALL_DIR ${THIRD_PARTY_DIR}/cutlass)
  set(CUTLASS_INCLUDE_DIR ${CUTLASS_INSTALL_DIR}/include CACHE PATH "" FORCE)
  set(CUTLASS_LIBRARY_DIR ${CUTLASS_INSTALL_DIR}/lib CACHE PATH "" FORCE)
  set(CUTLASS_LIBRARIES ${CUTLASS_LIBRARY_DIR}/libcutlass.so)
  set(CUTLASS_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass/src/cutlass/)

  # CUTLASS tensor-op kernels need SM 70 or newer; drop older archs.
  foreach(arch ${CUDA_REAL_ARCHS_LIST})
    if(arch GREATER_EQUAL 70)
      list(APPEND CUTLASS_REAL_ARCHS ${arch})
    endif()
  endforeach()

  if(THIRD_PARTY)
    ExternalProject_Add(
      ${CUTLASS_PROJECT}
      PREFIX cutlass
      URL ${CUTLASS_URL}
      URL_MD5 ${CUTLASS_MD5}
      UPDATE_COMMAND ""
      BUILD_BYPRODUCTS ${CUTLASS_LIBRARIES}
      CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
                 -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
                 -DCMAKE_CXX_FLAGS_DEBUG:STRING=${CMAKE_CXX_FLAGS_DEBUG}
                 -DCMAKE_CXX_FLAGS_RELEASE:STRING=${CMAKE_CXX_FLAGS_RELEASE}
      CMAKE_CACHE_ARGS
        -DCMAKE_CUDA_COMPILER:STRING=${CUDAToolkit_NVCC_EXECUTABLE}
        -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
        -DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
        -DCMAKE_INSTALL_PREFIX:PATH=${CUTLASS_INSTALL_DIR}
        -DCMAKE_INSTALL_LIBDIR:PATH=${CUTLASS_LIBRARY_DIR}
        -DCMAKE_INSTALL_MESSAGE:STRING=${CMAKE_INSTALL_MESSAGE}
        -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
        # Build only the conv2d fprop kernels OneFlow actually uses to keep
        # the external build small.
        -DCUTLASS_LIBRARY_OPERATIONS:STRING=conv2d
        -DCUTLASS_LIBRARY_KERNELS:STRING=simt_hfprop_*,tensorop_f16_*fprop,tensorop_h*fprop
        -DCUTLASS_ENABLE_EXAMPLES:BOOL=OFF
        -DCUTLASS_ENABLE_PROFILER:BOOL=OFF
        -DCUTLASS_ENABLE_LIBRARY:BOOL=ON
        -DCUTLASS_NVCC_ARCHS:STRING=${CUTLASS_REAL_ARCHS}
        -DCUTLASS_ENABLE_TESTS:BOOL=OFF
        -DCUTLASS_UNITY_BUILD_ENABLED:BOOL=ON
        -DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING=
        -DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF)

    # Example headers (fused MHA, dual GEMM) are #include'd by OneFlow kernels
    # but not installed by CUTLASS itself, so copy them out of the source tree.
    add_custom_target(cutlass_copy_examples_to_destination DEPENDS cutlass)
    set(CUTLASS_SOURCE_EXAMPLES_DIR ${CUTLASS_SOURCE_DIR}/examples)
    set(CUTLASS_INSTALL_EXAMPLES_FILES
        "41_fused_multi_head_attention/iterators/make_residual_last.h"
        "41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h"
        "41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h"
        "41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h"
        "41_fused_multi_head_attention/mma_from_smem.h"
        "41_fused_multi_head_attention/epilogue_rescale_output.h"
        "41_fused_multi_head_attention/attention_scaling_coefs_updater.h"
        "41_fused_multi_head_attention/gemm_kernel_utils.h"
        "41_fused_multi_head_attention/fmha_grouped_problem_visitor.h"
        "41_fused_multi_head_attention/fmha_grouped.h"
        "41_fused_multi_head_attention/default_fmha_grouped.h"
        "41_fused_multi_head_attention/epilogue_pipelined.h"
        "41_fused_multi_head_attention/epilogue_thread_apply_logsumexp.h"
        "41_fused_multi_head_attention/kernel_forward.h"
        "41_fused_multi_head_attention/gemm/custom_mma_multistage.h"
        "41_fused_multi_head_attention/gemm/custom_mma_base.h"
        "41_fused_multi_head_attention/gemm/custom_mma.h"
        "41_fused_multi_head_attention/gemm/custom_mma_pipelined.h"
        "41_fused_multi_head_attention/find_default_mma.h"
        "41_fused_multi_head_attention/debug_utils.h"
        "45_dual_gemm/test_run.h"
        "45_dual_gemm/kernel/dual_gemm.h"
        "45_dual_gemm/device/dual_gemm.h"
        "45_dual_gemm/dual_gemm_run.h"
        "45_dual_gemm/thread/left_silu_and_mul.h"
        "45_dual_gemm/threadblock/dual_mma_multistage.h"
        "45_dual_gemm/threadblock/dual_epilogue.h"
        "45_dual_gemm/threadblock/dual_mma_base.h")
    foreach(filename ${CUTLASS_INSTALL_EXAMPLES_FILES})
      add_custom_command(
        TARGET cutlass_copy_examples_to_destination
        POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
                ${CUTLASS_SOURCE_EXAMPLES_DIR}/${filename}
                ${CUTLASS_INSTALL_DIR}/examples/${filename}
        VERBATIM)
    endforeach()
  endif()
endif()
......@@ -7,12 +7,9 @@ set(EIGEN_URL https://github.com/Oneflow-Inc/eigen-git-mirror/archive/refs/tags/
set(EIGEN_MD5 a23cb70e12d1bf9b09cb28af51bc26ae)
use_mirror(VARIABLE EIGEN_URL URL ${EIGEN_URL})
add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING)
if(BUILD_CUDA)
add_definitions(-DEIGEN_USE_GPU)
endif()
add_definitions(-DEIGEN_NO_MALLOC)
#add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING -DEIGEN_NO_MALLOC -DEIGEN_USE_GPU)
if(THIRD_PARTY)
......
......@@ -34,11 +34,36 @@ else()
set(NCCL_INCLUDE_DIR ${NCCL_INSTALL_DIR}/include)
set(NCCL_LIBRARY_DIR ${NCCL_INSTALL_DIR}/lib)
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.12.10-1.tar.gz)
# Versions 2.13 and above may cause deadlocks
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.15.1-1.tar.gz)
set(NCCL_MD5 37b787ff8934cd9374b4612f663c17fa)
else()
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.12.10-1.tar.gz)
set(NCCL_MD5 bdb91f80b78c99831f09ca8bb28a1032)
endif()
use_mirror(VARIABLE NCCL_URL URL ${NCCL_URL})
list(APPEND NCCL_LIBRARIES ${NCCL_LIBRARY_DIR}/${NCCL_LIBRARY_NAME})
set(NCCL_ARCHS_LIST ${CUDA_REAL_ARCHS_LIST})
# remove redundant archs, https://github.com/NVIDIA/nccl/blob/cb111f764a6d46370f24f75101d6b219bb2dda54/makefiles/common.mk#L28
if("70" IN_LIST NCCL_ARCHS_LIST AND "75" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "75")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "86" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "86")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "89" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "89")
endif()
foreach(arch ${NCCL_ARCHS_LIST})
string(APPEND NCCL_GENCODE "-gencode=arch=compute_${arch},code=sm_${arch} ")
endforeach()
if(THIRD_PARTY)
include(ProcessorCount)
......@@ -47,11 +72,12 @@ else()
nccl
PREFIX nccl
URL ${NCCL_URL}
URL_MD5 bdb91f80b78c99831f09ca8bb28a1032
URL_MD5 ${NCCL_MD5}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j${PROC_NUM} src.build CUDA_HOME=${CUDATOOLKIT_BIN_ROOT}
NVCC_GENCODE=${NCCL_GENCODE}
INSTALL_COMMAND make src.install PREFIX=${NCCL_INSTALL_DIR}
BUILD_BYPRODUCTS ${NCCL_LIBRARIES})
......
......@@ -14,3 +14,4 @@ dataclasses; python_version<"3.7"
cmakelang==0.6.13
pytest-xdist
rich
portalocker
Auto Parallelism
====================================================
As the scale of deep-learning models grows larger and larger, distributed training,
or parallelism, is needed. Data parallelism and model parallelism have been designed
to speed up training and to solve memory issues.
In oneflow, SBP signature enables users to configure parallelism policy easily.
However, users still need to specify the SBP property for each operator, or most of them.
Users might spend a couple of days digging into the detail of parallelism and get a
low throughput just because of a slight mistake in the configuration of SBP signature.
.. note::
It only works on :doc:`graph` mode.
Our strength
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To get rid of all those configurations for SBP signatures, we developed auto parallelism.
Still, configurations of placement are necessary and we have not supported auto placement
yet. If you read this paragraph before you rush into any SBP stuff, then congratulations,
you do not need to learn SBPs. You can start writing your code as you did under CPU mode.
Our auto parallelism would generate a fast strategy customized for your specific models,
the size of parameters, and the number of available GPUs.
How to use auto parallelism?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You just need to simply enable the configuration settings in the model
of :doc:`graph` .
Example::
import oneflow as flow
class SubclassGraph(flow.nn.Graph):
def __init__(self):
super().__init__() # MUST be called
# auto parallelism configuration
self.config.enable_auto_parallel(True)
# other configurations about auto parallelism
# ......
def build(self):
pass
.. warning::
If you enable auto parallelism, OneFlow will take care of the SBP configurations
of operators except for explicit ``to_global`` functions.
Configuration API for auto parallelism
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_auto_parallel
enable_auto_parallel_ignore_user_sbp_config
set_auto_parallel_computation_cost_ratio
set_auto_parallel_wait_time
enable_auto_parallel_trunk_algo
enable_auto_parallel_sbp_collector
oneflow.autograd
================================================
Functions and classes for autograd.
---------------------------------------------------
====================================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/autograd.html
``oneflow.autograd`` provides classes and functions implementing automatic differentiation of arbitrary scalar
valued functions. It requires minimal changes to the existing code - you only need to declare ``Tensor`` s
for which gradients should be computed with the ``requires_grad=True`` keyword. As of now, we only support
autograd for floating point ``Tensor`` types ( half, float, double and bfloat16).
.. currentmodule:: oneflow.autograd
.. autoclass:: oneflow.autograd.Function
:members: apply,
:special-members: __call__,
.. automodule:: oneflow.autograd
:members: grad,
backward,
.. autosummary::
:toctree: generated
:nosignatures:
backward
grad
Locally disabling gradient computation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
no_grad
enable_grad
set_grad_enabled
inference_mode
.. TODO(wyg): uncomment this after aligning accumulate grad
.. Default gradient layouts
.. ^^^^^^^^^^^^^^^^^^^^^^^^
.. A ``param.grad`` is accumulated by replacing ``.grad`` with a
.. new tensor ``.grad + new grad`` during :func:`oneflow.autograd.backward()` or
.. :func:`oneflow.Tensor.backward()`.
In-place operations on Tensors
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Supporting in-place operations in autograd is a hard matter, and we discourage
their use in most cases. Autograd's aggressive buffer freeing and reuse makes
it very efficient and there are very few occasions when in-place operations
actually lower memory usage by any significant amount. Unless you're operating
under heavy memory pressure, you might never need to use them.
Tensor autograd functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:nosignatures:
oneflow.Tensor.grad
oneflow.Tensor.requires_grad
oneflow.Tensor.is_leaf
oneflow.Tensor.backward
oneflow.Tensor.detach
oneflow.Tensor.register_hook
oneflow.Tensor.retain_grad
Function
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: Function
.. currentmodule:: oneflow.autograd
.. autosummary::
:toctree: generated
:nosignatures:
Function.forward
Function.backward
Function.apply
Context method mixins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When creating a new :class:`Function`, the following methods are available to `ctx`.
.. currentmodule:: oneflow._oneflow_internal.autograd.Function
.. autosummary::
:toctree: generated
:nosignatures:
FunctionCtx.mark_non_differentiable
FunctionCtx.save_for_backward
FunctionCtx.saved_tensors
oneflow.comm
===================================
oneflow communication function
----------------------------------
.. currentmodule:: oneflow.comm
.. automodule:: oneflow.comm
:members: all_reduce,
all_gather,
broadcast,
scatter,
all_to_all,
reduce,
gather,
reduce_scatter,
send,
recv,
barrier,
......@@ -45,9 +45,14 @@ extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"recommonmark",
"sphinx.ext.autosummary",
"sphinx_copybutton",
]
# build the templated autosummary files
autosummary_generate = True
numpydoc_show_class_members = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
......@@ -107,7 +112,6 @@ html_static_path = ["_static"]
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
......
oneflow.cuda
===================================
ONEFLOW.CUDA
----------------------------------
.. The documentation is referenced from: https://pytorch.org/docs/1.10/cuda.html.
.. currentmodule:: oneflow.cuda
.. automodule:: oneflow.cuda
:members: is_available,
device_count,
current_device,
set_device,
synchronize,
manual_seed_all,
manual_seed,
empty_cache,
HalfTensor,
FloatTensor,
DoubleTensor,
BoolTensor,
ByteTensor,
CharTensor,
IntTensor,
LongTensor,
\ No newline at end of file
.. autosummary::
:toctree: generated
:nosignatures:
is_available
device_count
current_device
set_device
synchronize
get_device_properties
get_device_capability
get_device_name
.. note::
The :attr:`current_device` returns local rank as device index. It is different from the 'torch.current_device()' in PyTorch.
Random Number Generator
-------------------------
.. autosummary::
:toctree: generated
:nosignatures:
manual_seed_all
manual_seed
GPU tensor
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
HalfTensor
FloatTensor
DoubleTensor
BoolTensor
ByteTensor
CharTensor
IntTensor
LongTensor
Memory management
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
empty_cache
\ No newline at end of file
oneflow.distributed
=========================================================
.. note ::
Please refer to `OneFlow Distributed Overview <https://docs.oneflow.org/master/parallelism/01_introduction.html>`__
for a brief introduction to all features related to distributed training.
OneFlow provides two ways to accomplish `Distributed Training`:
- The first way is that users are recommended to use OneFlow's global Tensor for distributed training. Global Tensor regards the computing cluster as a supercomputing device, allowing users to write distributed training code just like in a single-machine environment.
- OneFlow also provides a DDP(DistributedDataParallel) module aligned with PyTorch. DDP has been well-known and widely used in data parallelism by the majority of PyTorch users. Also see `PyTorch DDP introduction <https://pytorch.org/docs/1.10/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_.
Basic
-------------------------------
When you start distributed training in OneFlow, the following functions can be used.
.. currentmodule:: oneflow.env
.. autosummary::
:toctree: generated
:nosignatures:
get_world_size
get_rank
get_local_rank
get_node_size
init_rdma
rdma_is_initialized
`Global Tensor`
--------------------------------------------------------------
Construct `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A `Global Tensor` can be created with a ``placement`` and a ``sbp``. The ``placement`` describes the physical devices on which the global tensor will be allocated, and the ``sbp`` describes its distribution among these devices.
::
>>> import oneflow as flow
>>> # Place a global tensor on cuda device of rank(process) 0 and 1
>>> placement = flow.placement(type="cuda", ranks=[0, 1])
>>> # Each rank's local data is a part of the data, obtained by splitting the global data on dim 0
>>> sbp = flow.sbp.split(dim=0)
>>> # Create a global tensor by randn
>>> x = flow.randn(4, 5, placement=placement, sbp=sbp)
>>> x.shape
oneflow.Size([4, 5])
Convert `Local Tensor` to `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With ``Tensor.to_global`` interface, `Local Tensor` can create a `Global Tensor` and use that `Local Tensor` as its local component at the current node.
Two `local tensors` with the shape of ``(2,5)`` are created separately on two devices. While after the ``to_global`` method, the `global tensor` with a shape of ``(4,5)`` is obtained.
Code running on Node 0
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Code running on Node 1
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Redistribute `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Redistributing a `Global Tensor` means moving its data to another device group (or placement), or changing its data distribution (or SBP) across the group, or both at the same time. The redistributed tensor is still a `Global Tensor`.
::
>>> import oneflow as flow
>>> x = flow.tensor([1.0, 2.0], placement=flow.placement("cuda", ranks=[0, 1]), sbp=flow.sbp.split(0))
>>> y = x.to_global(placement=flow.placement("cuda", ranks=[2, 3]), sbp=flow.sbp.broadcast)
According to the operator's semantics, OneFlow defines a sequence of valid input and output SBP combinations for each built-in operator. So OneFlow could automatically redistribute the `Global Tensor` to satisfy the operator's SBP requirements for its input Tensor. For example, the following code:
::
>>> import oneflow as flow
>>> x = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(0))
>>> y = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(1))
>>> z = x + y
When ``x + y`` is executed, since x is split along dimension ``0`` and y is split along dimension ``1``, their local components at each node can not be added directly, then OneFlow will automatically redistribute one of x and y to make them have the same SBP, and complete the add operation successfully.
.. note ::
- Global Tensor can not be used in combination with DDP currently.
- Global Tensor requires all devices to execute at the same pace, otherwise, it may cause multi-process deadlock.
Get Local Tensor from Global Tensor
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With ``Tensor.to_local`` interface, the `Global Tensor` can return its local component at the current node.
::
y = x.to_local()
y.is_local
True
y
tensor([[ 2.9186e-01, -3.9442e-01, 4.7072e-04, -3.2216e-01, 1.7788e-01],
[-4.5284e-01, 1.2361e-01, -3.5962e-01, 2.6651e-01, 1.2951e+00]],
device='cuda:0', dtype=oneflow.float32)
DistributedDataParallel
--------------------------------------------------------------
For more information about DistributedDataParallel, see ``nn.parallel.DistributedDataParallel``
The following script shows the process of using ``oneflow.nn.parallel.DistributedDataParallel`` for training data parallel:
.. code-block::
import oneflow as flow
from oneflow.nn.parallel import DistributedDataParallel as ddp
train_x = [
flow.tensor([[1, 2], [2, 3]], dtype=flow.float32),
flow.tensor([[4, 6], [3, 1]], dtype=flow.float32),
]
train_y = [
flow.tensor([[8], [13]], dtype=flow.float32),
flow.tensor([[26], [9]], dtype=flow.float32),
]
class Model(flow.nn.Module):
def __init__(self):
super().__init__()
self.lr = 0.01
self.iter_count = 500
self.w = flow.nn.Parameter(flow.tensor([[0], [0]], dtype=flow.float32))
def forward(self, x):
x = flow.matmul(x, self.w)
return x
m = Model().to("cuda")
m = ddp(m)
loss = flow.nn.MSELoss(reduction="sum")
optimizer = flow.optim.SGD(m.parameters(), m.lr)
for i in range(0, m.iter_count):
rank = flow.env.get_rank()
x = train_x[rank].to("cuda")
y = train_y[rank].to("cuda")
y_pred = m(x)
l = loss(y_pred, y)
if (i + 1) % 50 == 0:
print(f"{i+1}/{m.iter_count} loss:{l}")
optimizer.zero_grad()
l.backward()
optimizer.step()
print(f"\nw:{m.w}")
There are only two differences between the data parallelism training code and the stand-alone single-card script:
- Use `DistributedDataParallel` to wrap the module object (`m = ddp(m)`)
- Use `get_rank` to get the rank of the current process and distribute the corresponding data to each device.
Then use `launcher` to run the script, leave everything else to OneFlow, which makes distributed training as simple as stand-alone single-card training:
::
python3 -m oneflow.distributed.launch --nproc_per_node 2 ./ddp_train.py
Communication collectives
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.comm
.. autosummary::
:toctree: generated
:nosignatures:
all_reduce
all_gather
all_gather_into_tensor
all_to_all
broadcast
barrier
gather
reduce
reduce_scatter
reduce_scatter_tensor
recv
scatter
send
Launching distributed training
--------------------------------------------------------------
.. currentmodule:: oneflow.distributed
Run the commands below to see more about its usage.
......
oneflow.distributions
==================================================
.. contents:: oneflow.distributions
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. currentmodule:: oneflow.distributions
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
Distribution
Categorical
oneflow.env
===================================
Environment
----------------------------------
.. currentmodule:: oneflow
.. autofunction:: oneflow.env.get_world_size
.. autofunction:: oneflow.env.get_rank
.. autofunction:: oneflow.env.get_local_rank
.. autofunction:: oneflow.env.get_node_size
.. autofunction:: oneflow.env.init_rdma
.. autofunction:: oneflow.env.rdma_is_initialized
This diff is collapsed.
oneflow.nn.functional
===========================================
Functional operations for neural networks
-------------------------------------------
.. currentmodule:: oneflow.nn.functional
.. autofunction:: conv1d
.. autofunction:: conv2d
.. autofunction:: conv3d
.. autofunction:: conv_transpose1d
.. autofunction:: conv_transpose2d
.. autofunction:: conv_transpose3d
.. autofunction:: adaptive_avg_pool1d
.. autofunction:: adaptive_avg_pool2d
.. autofunction:: adaptive_avg_pool3d
.. autofunction:: relu
.. autofunction:: hardsigmoid
.. autofunction:: hardshrink
.. autofunction:: hardswish
.. autofunction:: hardtanh
.. autofunction:: normalize
.. autofunction:: layer_norm
.. autofunction:: leaky_relu
.. autofunction:: elu
.. autofunction:: celu
.. autofunction:: selu
.. autofunction:: sigmoid
.. autofunction:: pad
.. autofunction:: prelu
.. autofunction:: logsigmoid
.. autofunction:: log_softmax
.. autofunction:: gelu
.. autofunction:: glu
.. autofunction:: softsign
.. autofunction:: softmax
.. autofunction:: softplus
.. autofunction:: tanh
.. autofunction:: threshold
.. autofunction:: softshrink
.. autofunction:: silu
.. autofunction:: mish
.. autofunction:: one_hot
.. autofunction:: triplet_margin_loss
.. autofunction:: dropout
.. autofunction:: affine_grid
.. autofunction:: grid_sample
.. autofunction:: interpolate
.. autofunction:: ctc_greedy_decoder
.. autofunction:: sparse_softmax_cross_entropy
.. autofunction:: embedding
.. autofunction:: linear
.. autofunction:: cosine_similarity
.. autofunction:: cross_entropy
oneflow.nn.Graph
============================================================
Base class for running neural networks in Static Graph Mode.
Currently, there are two main ways to run models in deep learning frameworks, namely dynamic graphs and static graphs, which are also conventionally referred to as :ref:`dynamic graph` and :ref:`static graph` in OneFlow.
Both approaches have their advantages and disadvantages, and OneFlow provides support for both approaches, with Eager mode being the default.
Generally speaking, dynamic graphs are easier to use and static graphs have more performance advantages. :class:`oneflow.nn.Graph` module is provided by OneFlow to allow users to build static graphs and train models with Eager-like programming conventions.
.. contents:: oneflow.nn.Graph
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. _dynamic graph:
Eager Mode to Static Graph Mode
------------------------------------------------------------
OneFlow runs in Eager mode by default.
OneFlow's nn.Graph is programmed in a style very similar to Eager Mode, so it is possible to make small changes and get large performance gains.
The following script shows the process of building a neural network in eager mode using the interface under ``oneflow.nn`` :
.. code-block::
import oneflow as flow
import oneflow.nn as nn
class ModuleMyLinear(nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.weight = nn.Parameter(flow.randn(in_features, out_features))
self.bias = nn.Parameter(flow.randn(out_features))
def forward(self, input):
return flow.matmul(input, self.weight) + self.bias
linear_model = ModuleMyLinear(4, 3)
Eager ``nn.Module`` can be reused by ``nn.Graph``. The above script for eager mode can be changed to static Graph mode by adding just a few lines of code, which consists of the following steps:
- Define your customized graph as a subclass of ``nn.Graph``
- At the beginning of __init__, call super().__init__() to let OneFlow do the necessary initialization of the Graph
- Reuse the ``nn.Module`` object in Eager mode in __init__ (self.model = model)
- Describe the computation in the ``build`` method
- Instantiate your graph then call it.
.. code-block::
class GraphMyLinear(nn.Graph):
def __init__(self):
super().__init__()
self.model = linear_model
def build(self, input):
return self.model(input)
graph_mylinear = GraphMyLinear()
input = flow.randn(1, 4)
out = graph_mylinear(input)
print(out)
tensor([[-0.3298, -3.7907, 0.1661]], dtype=oneflow.float32)
.. _static graph:
Static Graph Mode
------------------------------------------------------------
.. currentmodule:: oneflow.nn
.. autoclass:: oneflow.nn.Graph
:members: __init__,
build,
__call__,
add_optimizer,
set_grad_scaler,
state_dict,
load_state_dict,
name,
debug,
__repr__,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.graph_config.GraphConfig
:members: enable_amp,
enable_zero,
allow_fuse_model_update_ops,
allow_fuse_add_to_output,
allow_fuse_cast_scale,
set_gradient_accumulation_steps,
enable_cudnn_conv_heuristic_search_algo,
enable_straighten_algorithm,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.block_config.BlockConfig
:members: stage_id,
set_stage,
activation_checkpointing,
:member-order: bysource
Constructing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Base class for training or evaluating a neural network in static graph mode.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__init__
build
add_optimizer
set_grad_scaler
Executing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Call a nn.Graph instance to run a customized graph.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__call__
Config options on a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Optimization options of a nn.Graph.
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_amp
enable_zero
allow_fuse_model_update_ops
allow_fuse_add_to_output
allow_fuse_cast_scale
set_gradient_accumulation_steps
enable_cudnn_conv_heuristic_search_algo
enable_straighten_algorithm
enable_compress_memory
Config options on a GraphModule
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
GraphModule is the graph representation of a nn.Module in a nn.Graph.
When an nn.Module is added into an nn.Graph, it is wrapped into a ProxyModule. The ProxyModule has a GraphModule inside it.
You can get and set the GraphModule to enable graph optimization on the nn.Module.
.. currentmodule:: oneflow.nn.graph.graph_block.GraphModule
.. autosummary::
:toctree: generated
:nosignatures:
set_stage
activation_checkpointing
Save & Load a Model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
state_dict
load_state_dict
Debug a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
__repr__
debug
name
oneflow.hub
===================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/hub.html
Oneflow Hub is a pre-trained model repository designed to facilitate research reproducibility.
Publishing models
-----------------
Oneflow Hub supports publishing pre-trained models(model definitions and pre-trained weights)
to a github repository by adding a simple ``hubconf.py`` file;
``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a python function
(example: a pre-trained model you want to publish).
::
def entrypoint_name(*args, **kwargs):
# args & kwargs are optional, for models which take positional/keyword arguments.
...
How to implement an entrypoint?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is a code snippet that specifies an entrypoint for the ``resnet18`` model, expanding
the implementation in ``Oneflow-Inc/vision/hubconf.py``.
In most cases, importing the right function in ``hubconf.py`` is sufficient. Here we
just use the expanded version as an example to show how it works.
You can see the full script in
`Oneflow-Inc/vision repo <https://github.com/Oneflow-Inc/vision/blob/master/hubconf.py>`_
::
dependencies = ['oneflow']
from flowvision.models.resnet import resnet18 as _resnet18
# resnet18 is the name of entrypoint
def resnet18(pretrained=False, **kwargs):
""" # This docstring shows up in hub.help()
Resnet18 model
pretrained (bool): kwargs, load pretrained weights into the model
"""
# Call the model, load pretrained weights
model = _resnet18(pretrained=pretrained, **kwargs)
return model
- ``dependencies`` variable is a **list** of package names required to **load** the model. Note this might
be slightly different from dependencies required for training a model.
- ``args`` and ``kwargs`` are passed along to the real callable function.
- The docstring of the function works as a help message. It explains what the model does and what
  positional/keyword arguments are allowed. It's highly recommended to add a few examples here.
- An entrypoint function can either return a model (nn.Module), or auxiliary tools to make the user workflow smoother, e.g. tokenizers.
- Callables prefixed with underscore are considered as helper functions which won't show up in :func:`oneflow.hub.list()`.
- Pretrained weights can either be stored locally in the github repo, or loadable by
:func:`oneflow.hub.load_state_dict_from_url()`. If less than 2GB, it's recommended to attach it to a `project release <https://help.github.com/en/articles/distributing-large-binaries>`_
and use the url from the release.
In the example above ``flowvision.models.resnet.resnet18`` handles ``pretrained``, alternatively you can put the following logic in the entrypoint definition.
::
if pretrained:
# For checkpoint saved in local github repo, e.g. <RELATIVE_PATH_TO_CHECKPOINT>=weights/save.pth
dirname = os.path.dirname(__file__)
checkpoint = os.path.join(dirname, <RELATIVE_PATH_TO_CHECKPOINT>)
state_dict = oneflow.load(checkpoint)
model.load_state_dict(state_dict)
# For checkpoint saved elsewhere
checkpoint = 'https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip'
model.load_state_dict(oneflow.hub.load_state_dict_from_url(checkpoint, progress=False))
Important Notice
^^^^^^^^^^^^^^^^
- The published models should be at least in a branch/tag. It can't be a random commit.
Loading models from Hub
-----------------------
OneFlow Hub provides convenient APIs to explore all available models in hub
through :func:`oneflow.hub.list()`, show docstring and examples through
:func:`oneflow.hub.help()` and load the pre-trained models using
:func:`oneflow.hub.load()`.
.. automodule:: oneflow.hub
.. autofunction:: list
.. autofunction:: help
.. autofunction:: load
.. autofunction:: download_url_to_file
.. autofunction:: load_state_dict_from_url
Running a loaded model:
^^^^^^^^^^^^^^^^^^^^^^^
Note that ``*args`` and ``**kwargs`` in :func:`oneflow.hub.load()` are used to
**instantiate** a model. After you have loaded a model, how can you find out
what you can do with the model?
A suggested workflow is
- ``dir(model)`` to see all available methods of the model.
- ``help(model.foo)`` to check what arguments ``model.foo`` takes to run
To help users explore without referring to documentation back and forth, we strongly
recommend repo owners make function help messages clear and succinct. It's also helpful
to include a minimal working example.
Where are my downloaded models saved?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The locations are used in the order of
- Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``
- ``$ONEFLOW_HOME/hub``, if environment variable ``ONEFLOW_HOME`` is set.
- ``$XDG_CACHE_HOME/oneflow/hub``, if environment variable ``XDG_CACHE_HOME`` is set.
- ``~/.cache/oneflow/hub``
.. autofunction:: get_dir
.. autofunction:: set_dir
Caching logic
^^^^^^^^^^^^^
By default, we don't clean up files after loading them. Hub uses the cached copy by default if one already exists in the
directory returned by :func:`~oneflow.hub.get_dir()`.
Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
the existing github folder and downloaded weights, reinitialize a fresh download. This is useful
when updates are published to the same branch, users can keep up with the latest release.
Known limitations:
^^^^^^^^^^^^^^^^^^
OneFlow Hub works by importing the package as if it were installed. There are some side effects
introduced by importing in Python. For example, you can see new items in Python caches
``sys.modules`` and ``sys.path_importer_cache`` which is normal Python behavior.
This also means that you may have import errors when importing different models
from different repos, if the repos have the same sub-package names (typically, a
``model`` subpackage). A workaround for these kinds of import errors is to
remove the offending sub-package from the ``sys.modules`` dict; more details can
be found in `this github issue
<https://github.com/pytorch/hub/issues/243#issuecomment-942403391>`_.
A known limitation that is worth mentioning here: users **CANNOT** load two different branches of
the same repo in the **same python process**. It's just like installing two packages with the
same name in Python, which is not good. Cache might join the party and give you surprises if you
actually try that. Of course it's totally fine to load them in separate processes.
......@@ -3,9 +3,14 @@ oneflow.nn.image
Image operations for neural networks
--------------------------------------
.. currentmodule:: oneflow.nn.image
.. automodule:: oneflow.nn.image
:members: Resize,
batch_align,
decode,
flip,
normalize
.. autosummary::
:toctree: generated
:nosignatures:
Resize
batch_align
decode
flip
normalize
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment