Commit a715222c authored by yuguo's avatar yuguo
Browse files

0.9.1-rocm

parent f262efc9
......@@ -45,7 +45,7 @@ file(
"${PROJECT_SOURCE_DIR}/oneflow/user/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/api/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/maybe/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/extension/python/*.*")
"${PROJECT_SOURCE_DIR}/oneflow/extension/*.*")
foreach(oneflow_single_file ${oneflow_all_src})
# Verify whether this file is for other platforms
......@@ -80,6 +80,21 @@ foreach(oneflow_single_file ${oneflow_all_src})
if(BUILD_CUDA)
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
if(BUILD_ROCM)
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cu$")
get_filename_component(oneflow_single_file_hip_cpp_dir ${oneflow_single_file} DIRECTORY)
get_filename_component(oneflow_single_file_hip_cpp ${oneflow_single_file} NAME_WE)
add_custom_command(
OUTPUT "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
COMMAND ${CMAKE_COMMAND} -E copy "${oneflow_single_file}" "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
DEPENDS "${oneflow_single_file}"
)
list(APPEND of_all_obj_cc ${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp)
endif()
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cuh$")
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
endif()
set(group_this ON)
endif()
......@@ -96,8 +111,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
set(group_this ON)
endif()
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/extension/python/.*\\.(h|cpp)$")
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/extension/.*\\.(c|h|cpp)$")
list(APPEND of_pyext_obj_cc ${oneflow_single_file})
set(group_this ON)
endif()
......@@ -105,7 +119,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*\\.cpp$")
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*_test\\.cpp$")
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe|thread)/.*_test\\.cpp$")
# test file
list(APPEND of_all_test_cc ${oneflow_single_file})
elseif(APPLE AND "${oneflow_single_file}" MATCHES
......@@ -136,6 +150,7 @@ add_custom_target(
of_format
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${CMAKE_CURRENT_SOURCE_DIR}/oneflow --fix
--exclude="oneflow/user/kernels/fmha_flash_attention"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${ONEFLOW_PYTHON_DIR} --fix --exclude="oneflow/include" --exclude="oneflow/core"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_clang_format.py --source_dir
......@@ -254,20 +269,22 @@ if("${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/32805e60c9de1f82887cd2af30d247dcabd2e1d3.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "f2f17229cf21049663b8ef4f2b6b8062"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "6b7c6506d5922de9632c8ff012b2f945"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e0ea669a9f0872d35bffda5ec6c5ac6f"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "241a333828bba1efa35aff4c4fc2ce87"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "075fbfdf06cb3f02373ea44971af7b03"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2")
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "334997b4879aba15d9323a732356cf2a")
unset(LLVM_MONO_REPO_URL CACHE)
unset(LLVM_MONO_REPO_MD5 CACHE)
endif()
set(LLVM_MONO_REPO_URL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
set(LLVM_MONO_REPO_URL "https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-15.0.6.zip"
CACHE STRING "")
use_mirror(VARIABLE LLVM_MONO_REPO_URL URL ${LLVM_MONO_REPO_URL})
set(LLVM_MONO_REPO_MD5 "334997b4879aba15d9323a732356cf2a" CACHE STRING "")
set(LLVM_MONO_REPO_MD5 "1ccc00accc87a1a5d42a275d6e31cd8c" CACHE STRING "")
set(ONEFLOW_BUILD_ROOT_DIR "${PROJECT_BINARY_DIR}")
add_subdirectory(${PROJECT_SOURCE_DIR}/oneflow/ir)
if(WITH_MLIR)
......@@ -306,9 +323,9 @@ elseif(UNIX)
${oneflow_third_party_libs}
${EXTERNAL_TARGETS}
-Wl,--no-whole-archive
-Wl,--as-needed
-ldl
-lrt
-Wl,--version-script ${PROJECT_SOURCE_DIR}/version_script.lds)
-lrt)
if(BUILD_CUDA)
target_link_libraries(oneflow CUDA::cudart_static)
endif()
......@@ -317,6 +334,43 @@ elseif(WIN32)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /WHOLEARCHIVE:oneflow")
endif()
if (BUILD_ROCM)
# AMD compiler fails to compile these three files with '-O1/2/3'.
# The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
# so '-O0' will override '-O1/2/3'.
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel_hip.cpp
#${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math1_hip.cpp
PROPERTIES COMPILE_OPTIONS "-O0")
endif()
if(BUILD_CUDA)
string(JOIN "," CUDA_REAL_ARCHS ${CUDA_REAL_ARCHS_LIST})
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/core/hardware/cuda_device_descriptor.cpp
PROPERTIES COMPILE_FLAGS "-DCUDA_REAL_ARCHS=\"${CUDA_REAL_ARCHS}\"")
endif()
if(BUILD_CUDA AND WITH_CUTLASS)
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
add_definitions(-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1)
endif()
set_property(
SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND PROPERTY INCLUDE_DIRECTORIES
${CUTLASS_INSTALL_DIR}/examples/41_fused_multi_head_attention)
set_property(SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_glu_kernel.cu APPEND
PROPERTY INCLUDE_DIRECTORIES ${CUTLASS_INSTALL_DIR}/examples/45_dual_gemm)
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA")
set_property(
SOURCE
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND
PROPERTY COMPILE_OPTIONS "--use_fast_math")
endif()
endif()
# oneflow api common
if(BUILD_PYTHON OR BUILD_CPP_API)
file(GLOB_RECURSE of_api_common_files ${PROJECT_SOURCE_DIR}/oneflow/api/common/*.h
......@@ -343,6 +397,8 @@ if(BUILD_PYTHON)
add_dependencies(of_pyext_obj oneflow)
pybind11_add_module(oneflow_internal ${PYBIND11_SRCS} ${of_pybind_obj_cc} ${PYBIND_REGISTRY_CC})
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cublas/lib")
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cudnn/lib")
set_compile_options_to_oneflow_target(oneflow_internal)
set_property(TARGET oneflow_internal PROPERTY CXX_VISIBILITY_PRESET "default")
add_dependencies(oneflow_internal of_functional_obj of_functional_tensor_obj of_op_schema)
......@@ -419,6 +475,9 @@ if(BUILD_TESTING)
oneflow_add_test(oneflow_testexe SRCS ${of_all_test_cc} TEST_NAME oneflow_test)
target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs} glog::glog
${oneflow_test_libs})
if(WITH_MLIR)
target_link_libraries(oneflow_testexe MLIROneFlowExtension)
endif()
endif()
if(BUILD_CPP_API)
......@@ -524,6 +583,10 @@ if(BUILD_CPP_API)
if(BUILD_CUDA)
checkdirandappendslash(DIR ${NCCL_LIBRARY_DIR} OUTPUT NCCL_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${NCCL_LIBRARY_DIR_APPENDED})
if(WITH_CUTLASS)
checkdirandappendslash(DIR ${CUTLASS_LIBRARY_DIR} OUTPUT CUTLASS_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${CUTLASS_LIBRARY_DIR_APPENDED})
endif()
endif()
install(
......@@ -555,6 +618,7 @@ if(BUILD_CPP_API)
llvm-PerfectShuffle
llvm-tblgen
mlir-tblgen
mlir-pdll
obj2yaml
oneflow_tblgen
yaml-bench
......
......@@ -38,7 +38,13 @@ set(ONEFLOW_OP_GROUPS
"TRIGONOMETRIC"
"UNARY"
"UPSAMPLE"
"ONE_EMBEDDING")
"ONE_EMBEDDING"
"LINEAR_ALGEBRA"
"SYSTEM")
if(WITH_MLIR)
list(APPEND ONEFLOW_OP_GROUPS "MLIR_JIT")
endif(WITH_MLIR)
foreach(OP_GROUP_NAME IN LISTS ONEFLOW_OP_GROUPS)
list(APPEND ONEFLOW_SCHEMA_TABLEGEN_FLAGS "-DGET_ONEFLOW_${OP_GROUP_NAME}_OP_DEFINITIONS")
endforeach()
......
......@@ -137,7 +137,8 @@ endif()
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${RE2_INCLUDE_DIR})
if(BUILD_CUDA)
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
# Always use third_party/cub for Clang CUDA in case of compatibility issues
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" AND CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
if(CMAKE_CXX_STANDARD LESS 14)
add_definitions(-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)
add_definitions(-DCUB_IGNORE_DEPRECATED_CPP11)
......@@ -150,6 +151,7 @@ if(BUILD_CUDA)
list(APPEND oneflow_third_party_dependencies cub_copy_headers_to_destination)
endif()
include(nccl)
include(cutlass)
list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES})
list(APPEND oneflow_third_party_libs ${CUDNN_LIBRARIES})
......@@ -159,12 +161,19 @@ if(BUILD_CUDA)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUDNN_INCLUDE_DIRS} ${CUB_INCLUDE_DIR}
${NCCL_INCLUDE_DIR})
endif()
if(WITH_CUTLASS)
list(APPEND oneflow_third_party_dependencies cutlass)
list(APPEND oneflow_third_party_dependencies cutlass_copy_examples_to_destination)
list(APPEND oneflow_third_party_libs ${CUTLASS_LIBRARIES})
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUTLASS_INCLUDE_DIR})
endif()
endif()
if (BUILD_ROCM)
# Find rocm packages
find_package(hip)
find_package(hipfft)
find_package(hipblas)
find_package(hipcub)
find_package(hiprand)
......@@ -173,11 +182,31 @@ if (BUILD_ROCM)
find_package(rccl)
add_definitions(-DWITH_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
add_definitions(-D__HIPCC__)
if (BUILD_ROCM_GRAPHS)
add_definitions(-DWITH_ROCM_GRAPHS)
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-macro-redefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-inconsistent-missing-override")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-exceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-negative")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-overflow")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-duplicate-decl-specifier")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-implicit-int-float-conversion")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pass-failed")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-gpu-rdc")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs hip::hipfft)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
list(APPEND oneflow_third_party_libs roc::rocrand)
......@@ -186,16 +215,17 @@ if (BUILD_ROCM)
link_directories(${ROCM_PATH}/rccl/lib)
list(APPEND oneflow_third_party_libs rccl)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${HIP_INCLUDE_DIRS}
${HIPFFT_INCLUDE_DIRS}
${HIPBLAS_INCLUDE_DIRS}
${HIPCUB_INCLUDE_DIRS}
"${ROCM_PATH}/hiprand/include"
"${ROCM_PATH}/rocrand/include"
"${ROCM_PATH}/roctracer/include"
${MIOPEN_INCLUDE_DIRS}
${RCCL_INCLUDE_DIRS})
message(STATUS "ONEFLOW_THIRD_PARTY_INCLUDE_DIRS: ${ONEFLOW_THIRD_PARTY_INCLUDE_DIRS}")
endif()
if(BUILD_RDMA)
if(UNIX)
include(CheckIncludeFiles)
......
include(ExternalProject)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(WITH_CUTLASS_INIT OFF)
else()
set(WITH_CUTLASS_INIT ON)
endif()
set(WITH_CUTLASS ${WITH_CUTLASS_INIT} CACHE BOOL "")
if(WITH_CUTLASS)
add_definitions(-DWITH_CUTLASS)
find_package(Threads)
set(CUTLASS_PROJECT cutlass)
set(CUTLASS_INSTALL_DIR ${THIRD_PARTY_DIR}/cutlass)
set(CUTLASS_INCLUDE_DIR ${CUTLASS_INSTALL_DIR}/include CACHE PATH "" FORCE)
set(CUTLASS_LIBRARY_DIR ${CUTLASS_INSTALL_DIR}/lib CACHE PATH "" FORCE)
set(CUTLASS_LIBRARIES ${CUTLASS_LIBRARY_DIR}/libcutlass.so)
set(CUTLASS_SOUREC_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass/src/cutlass/)
foreach(arch ${CUDA_REAL_ARCHS_LIST})
if(arch GREATER_EQUAL 70)
list(APPEND CUTLASS_REAL_ARCHS ${arch})
endif()
endforeach()
if(THIRD_PARTY)
ExternalProject_Add(
${CUTLASS_PROJECT}
PREFIX cutlass
URL ${CUTLASS_URL}
URL_MD5 ${CUTLASS_MD5}
UPDATE_COMMAND ""
BUILD_BYPRODUCTS ${CUTLASS_LIBRARIES}
CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_DEBUG:STRING=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_CXX_FLAGS_RELEASE:STRING=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_CACHE_ARGS
-DCMAKE_CUDA_COMPILER:STRING=${CUDAToolkit_NVCC_EXECUTABLE}
-DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
-DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
-DCMAKE_INSTALL_PREFIX:PATH=${CUTLASS_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${CUTLASS_LIBRARY_DIR}
-DCMAKE_INSTALL_MESSAGE:STRING=${CMAKE_INSTALL_MESSAGE}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCUTLASS_LIBRARY_OPERATIONS:STRING=conv2d
-DCUTLASS_LIBRARY_KERNELS:STRING=simt_hfprop_*,tensorop_f16_*fprop,tensorop_h*fprop
-DCUTLASS_ENABLE_EXAMPLES:BOOL=OFF
-DCUTLASS_ENABLE_PROFILER:BOOL=OFF
-DCUTLASS_ENABLE_LIBRARY:BOOL=ON
-DCUTLASS_NVCC_ARCHS:STRING=${CUTLASS_REAL_ARCHS}
-DCUTLASS_ENABLE_TESTS:BOOL=OFF
-DCUTLASS_UNITY_BUILD_ENABLED:BOOL=ON
-DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING=
-DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF)
add_custom_target(cutlass_copy_examples_to_destination DEPENDS cutlass)
set(CUTLASS_SOURCE_EXAMPLES_DIR ${CUTLASS_SOUREC_DIR}/examples)
set(CUTLASS_INSTALL_EXAMPLES_FILES
"41_fused_multi_head_attention/iterators/make_residual_last.h"
"41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h"
"41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h"
"41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h"
"41_fused_multi_head_attention/mma_from_smem.h"
"41_fused_multi_head_attention/epilogue_rescale_output.h"
"41_fused_multi_head_attention/attention_scaling_coefs_updater.h"
"41_fused_multi_head_attention/gemm_kernel_utils.h"
"41_fused_multi_head_attention/fmha_grouped_problem_visitor.h"
"41_fused_multi_head_attention/fmha_grouped.h"
"41_fused_multi_head_attention/default_fmha_grouped.h"
"41_fused_multi_head_attention/epilogue_pipelined.h"
"41_fused_multi_head_attention/epilogue_thread_apply_logsumexp.h"
"41_fused_multi_head_attention/kernel_forward.h"
"41_fused_multi_head_attention/gemm/custom_mma_multistage.h"
"41_fused_multi_head_attention/gemm/custom_mma_base.h"
"41_fused_multi_head_attention/gemm/custom_mma.h"
"41_fused_multi_head_attention/gemm/custom_mma_pipelined.h"
"41_fused_multi_head_attention/find_default_mma.h"
"41_fused_multi_head_attention/debug_utils.h"
"45_dual_gemm/test_run.h"
"45_dual_gemm/kernel/dual_gemm.h"
"45_dual_gemm/device/dual_gemm.h"
"45_dual_gemm/dual_gemm_run.h"
"45_dual_gemm/thread/left_silu_and_mul.h"
"45_dual_gemm/threadblock/dual_mma_multistage.h"
"45_dual_gemm/threadblock/dual_epilogue.h"
"45_dual_gemm/threadblock/dual_mma_base.h")
foreach(filename ${CUTLASS_INSTALL_EXAMPLES_FILES})
add_custom_command(
TARGET cutlass_copy_examples_to_destination
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CUTLASS_SOURCE_EXAMPLES_DIR}/${filename}
${CUTLASS_INSTALL_DIR}/examples/${filename})
endforeach()
endif(THIRD_PARTY)
endif(WITH_CUTLASS)
......@@ -7,12 +7,9 @@ set(EIGEN_URL https://github.com/Oneflow-Inc/eigen-git-mirror/archive/refs/tags/
set(EIGEN_MD5 a23cb70e12d1bf9b09cb28af51bc26ae)
use_mirror(VARIABLE EIGEN_URL URL ${EIGEN_URL})
add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING)
if(BUILD_CUDA)
add_definitions(-DEIGEN_USE_GPU)
endif()
add_definitions(-DEIGEN_NO_MALLOC)
#add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING -DEIGEN_NO_MALLOC -DEIGEN_USE_GPU)
if(THIRD_PARTY)
......
......@@ -34,11 +34,36 @@ else()
set(NCCL_INCLUDE_DIR ${NCCL_INSTALL_DIR}/include)
set(NCCL_LIBRARY_DIR ${NCCL_INSTALL_DIR}/lib)
# Versions 2.13 and above may cause deadlocks
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.15.1-1.tar.gz)
set(NCCL_MD5 37b787ff8934cd9374b4612f663c17fa)
else()
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.12.10-1.tar.gz)
set(NCCL_MD5 bdb91f80b78c99831f09ca8bb28a1032)
endif()
use_mirror(VARIABLE NCCL_URL URL ${NCCL_URL})
list(APPEND NCCL_LIBRARIES ${NCCL_LIBRARY_DIR}/${NCCL_LIBRARY_NAME})
set(NCCL_ARCHS_LIST ${CUDA_REAL_ARCHS_LIST})
# remove redundant archs, https://github.com/NVIDIA/nccl/blob/cb111f764a6d46370f24f75101d6b219bb2dda54/makefiles/common.mk#L28
if("70" IN_LIST NCCL_ARCHS_LIST AND "75" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "75")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "86" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "86")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "89" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "89")
endif()
foreach(arch ${NCCL_ARCHS_LIST})
string(APPEND NCCL_GENCODE "-gencode=arch=compute_${arch},code=sm_${arch} ")
endforeach()
if(THIRD_PARTY)
include(ProcessorCount)
......@@ -47,11 +72,12 @@ else()
nccl
PREFIX nccl
URL ${NCCL_URL}
URL_MD5 bdb91f80b78c99831f09ca8bb28a1032
URL_MD5 ${NCCL_MD5}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j${PROC_NUM} src.build CUDA_HOME=${CUDATOOLKIT_BIN_ROOT}
NVCC_GENCODE=${NCCL_GENCODE}
INSTALL_COMMAND make src.install PREFIX=${NCCL_INSTALL_DIR}
BUILD_BYPRODUCTS ${NCCL_LIBRARIES})
......
......@@ -14,3 +14,4 @@ dataclasses; python_version<"3.7"
cmakelang==0.6.13
pytest-xdist
rich
portalocker
Auto Parallelism
====================================================
As deep-learning models grow larger and larger, distributed training,
or parallelism, becomes necessary. Data parallelism and model parallelism have been designed
to speed up training and to relieve memory pressure.
In OneFlow, the SBP signature lets users configure the parallelism policy easily.
However, users still need to specify the SBP property for each operator, or at least for most of them.
Users might spend days digging into the details of parallelism and still end up with
low throughput simply because of a slight mistake in the SBP signature configuration.
.. note::
It only works in :doc:`graph` mode.
Our strength
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To get rid of all those SBP signature configurations, we developed auto parallelism.
Still, placement configurations are necessary; auto placement is not supported yet.
If you are reading this paragraph before rushing into any SBP details, then congratulations:
you do not need to learn SBP. You can start writing your code just as you would in CPU mode.
Auto parallelism generates a fast strategy customized for your specific model,
the size of its parameters, and the number of available GPUs.
How to use auto parallelism?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You simply need to enable the corresponding setting in the model
of :doc:`graph` .
Example::
import oneflow as flow
class SubclassGraph(flow.nn.Graph):
def __init__(self):
super().__init__() # MUST be called
# auto parallelism configuration
self.config.enable_auto_parallel(True)
# other configurations about auto parallelism
# ......
def build(self):
pass
.. warning::
If you enable auto parallelism, OneFlow will take care of the SBP configurations
of operators, except for explicit ``to_global`` calls.
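For example, a minimal sketch (assuming the module's parameters have already been made global with a suitable placement) that combines auto parallelism with one explicit ``to_global`` call::

    import oneflow as flow

    class AutoParallelGraph(flow.nn.Graph):
        def __init__(self, model):
            super().__init__()
            # let auto parallelism search SBP signatures for all other operators
            self.config.enable_auto_parallel(True)
            self.model = model

        def build(self, x):
            # this explicit SBP configuration is respected by auto parallelism
            x = x.to_global(placement=flow.placement("cuda", ranks=[0, 1]),
                            sbp=flow.sbp.split(0))
            return self.model(x)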
Configuration API for auto parallelism
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_auto_parallel
enable_auto_parallel_ignore_user_sbp_config
set_auto_parallel_computation_cost_ratio
set_auto_parallel_wait_time
enable_auto_parallel_trunk_algo
enable_auto_parallel_sbp_collector
oneflow.autograd
================================================
Functions and classes for autograd.
---------------------------------------------------
====================================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/autograd.html
``oneflow.autograd`` provides classes and functions implementing automatic differentiation of arbitrary scalar-valued
functions. It requires minimal changes to the existing code - you only need to declare ``Tensor`` s
for which gradients should be computed with the ``requires_grad=True`` keyword. As of now, we only support
autograd for floating point ``Tensor`` types (half, float, double and bfloat16).
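A minimal example of this workflow::

    import oneflow as flow

    x = flow.tensor([1.0, 2.0, 3.0], requires_grad=True)
    y = (x * x).sum()   # scalar-valued function of x
    y.backward()        # compute dy/dx with autograd
    print(x.grad)       # gradient is 2 * x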
.. currentmodule:: oneflow.autograd
.. autosummary::
:toctree: generated
:nosignatures:
backward
grad
Locally disabling gradient computation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
no_grad
enable_grad
set_grad_enabled
inference_mode
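For example, ``no_grad`` can be used as a context manager to disable gradient tracking locally (a short sketch)::

    import oneflow as flow

    x = flow.ones(2, 3, requires_grad=True)
    with flow.no_grad():
        y = x * 2
    print(y.requires_grad)  # False: y was created with gradient tracking disabled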
.. TODO(wyg): uncomment this after aligning accumulate grad
.. Default gradient layouts
.. ^^^^^^^^^^^^^^^^^^^^^^^^
.. A ``param.grad`` is accumulated by replacing ``.grad`` with a
.. new tensor ``.grad + new grad`` during :func:`oneflow.autograd.backward()` or
.. :func:`oneflow.Tensor.backward()`.
In-place operations on Tensors
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Supporting in-place operations in autograd is a hard matter, and we discourage
their use in most cases. Autograd's aggressive buffer freeing and reuse makes
it very efficient and there are very few occasions when in-place operations
actually lower memory usage by any significant amount. Unless you're operating
under heavy memory pressure, you might never need to use them.
Tensor autograd functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:nosignatures:
oneflow.Tensor.grad
oneflow.Tensor.requires_grad
oneflow.Tensor.is_leaf
oneflow.Tensor.backward
oneflow.Tensor.detach
oneflow.Tensor.register_hook
oneflow.Tensor.retain_grad
Function
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: Function
.. currentmodule:: oneflow.autograd
.. autoclass:: oneflow.autograd.Function
:members: apply,
:special-members: __call__,
.. autosummary::
:toctree: generated
:nosignatures:
Function.forward
Function.backward
Function.apply
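A minimal sketch of a custom ``Function``, assuming the PyTorch-style static ``forward``/``backward`` interface listed above::

    import oneflow as flow

    class Square(flow.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)      # stash the input for the backward pass
            return x * x

        @staticmethod
        def backward(ctx, grad_output):
            (x,) = ctx.saved_tensors
            return 2 * x * grad_output    # d(x^2)/dx = 2x

    x = flow.tensor([3.0], requires_grad=True)
    y = Square.apply(x)
    y.sum().backward()
    print(x.grad)                         # gradient is 2 * x = 6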
Context method mixins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When creating a new :class:`Function`, the following methods are available on ``ctx``.
.. currentmodule:: oneflow._oneflow_internal.autograd.Function
.. autosummary::
:toctree: generated
:nosignatures:
.. automodule:: oneflow.autograd
:members: grad,
backward,
FunctionCtx.mark_non_differentiable
FunctionCtx.save_for_backward
FunctionCtx.saved_tensors
oneflow.comm
===================================
OneFlow communication functions
----------------------------------
.. currentmodule:: oneflow.comm
.. automodule:: oneflow.comm
:members: all_reduce,
all_gather,
broadcast,
scatter,
all_to_all,
reduce,
gather,
reduce_scatter,
send,
recv,
barrier,
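A sketch of collective usage, assuming the script is launched with ``oneflow.distributed.launch`` on two ranks and that ``all_reduce`` reduces in place as in ``torch.distributed``::

    import oneflow as flow

    # each rank contributes rank + 1; after all_reduce every rank holds the sum
    x = flow.tensor([float(flow.env.get_rank() + 1)], device="cuda")
    flow.comm.all_reduce(x)
    print(f"rank {flow.env.get_rank()}: {x.numpy()}")  # both ranks print [3.]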
......@@ -45,9 +45,14 @@ extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"recommonmark",
"sphinx.ext.autosummary",
"sphinx_copybutton",
]
# build the templated autosummary files
autosummary_generate = True
numpydoc_show_class_members = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
......@@ -107,7 +112,6 @@ html_static_path = ["_static"]
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
......
oneflow.cuda
===================================
ONEFLOW.CUDA
----------------------------------
.. The documentation is referenced from: https://pytorch.org/docs/1.10/cuda.html.
.. currentmodule:: oneflow.cuda
.. automodule:: oneflow.cuda
:members: is_available,
device_count,
current_device,
set_device,
synchronize,
manual_seed_all,
manual_seed,
empty_cache,
HalfTensor,
FloatTensor,
DoubleTensor,
BoolTensor,
ByteTensor,
CharTensor,
IntTensor,
LongTensor,
\ No newline at end of file
.. autosummary::
:toctree: generated
:nosignatures:
is_available
device_count
current_device
set_device
synchronize
get_device_properties
get_device_capability
get_device_name
.. note::
The :attr:`current_device` returns the local rank as the device index, which is different from ``torch.cuda.current_device()`` in PyTorch.
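A short usage sketch of the device query functions above::

    import oneflow as flow

    if flow.cuda.is_available():
        print(flow.cuda.device_count())    # number of visible GPUs
        print(flow.cuda.current_device())  # local rank used as the device index
        flow.cuda.manual_seed_all(0)       # seed the random number generator on every device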
Random Number Generator
-------------------------
.. autosummary::
:toctree: generated
:nosignatures:
manual_seed_all
manual_seed
GPU tensor
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
HalfTensor
FloatTensor
DoubleTensor
BoolTensor
ByteTensor
CharTensor
IntTensor
LongTensor
Memory management
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
empty_cache
\ No newline at end of file
oneflow.distributed
=========================================================
.. note ::
Please refer to `OneFlow Distributed Overview <https://docs.oneflow.org/master/parallelism/01_introduction.html>`__
for a brief introduction to all features related to distributed training.
OneFlow provides two ways to accomplish `Distributed Training`:
- The recommended way is to use OneFlow's global Tensor for distributed training. A global Tensor treats the computing cluster as one supercomputing device, allowing users to write distributed training code just as they would in a single-machine environment.
- OneFlow also provides a DDP (DistributedDataParallel) module aligned with PyTorch. DDP is well known and widely used for data parallelism by the majority of PyTorch users. Also see `PyTorch DDP introduction <https://pytorch.org/docs/1.10/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_.
Basic
-------------------------------
When you start distributed training in OneFlow, the following functions can be used.
.. currentmodule:: oneflow.env
.. autosummary::
:toctree: generated
:nosignatures:
get_world_size
get_rank
get_local_rank
get_node_size
init_rdma
rdma_is_initialized
`Global Tensor`
--------------------------------------------------------------
Construct `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A `Global Tensor` can be created with a ``placement`` and an ``sbp``. The ``placement`` describes the physical devices on which the global tensor will be allocated, and the ``sbp`` describes how its data is distributed among these devices.
::
>>> import oneflow as flow
>>> # Place a global tensor on cuda device of rank(process) 0 and 1
>>> placement = flow.placement(type="cuda", ranks=[0, 1])
>>> # Each rank's local data is the part obtained by splitting the global data along dim 0
>>> sbp = flow.sbp.split(dim=0)
>>> # Create a global tensor by randn
>>> x = flow.randn(4, 5, placement=placement, sbp=sbp)
>>> x.shape
oneflow.Size([4, 5])
Convert `Local Tensor` to `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With the ``Tensor.to_global`` interface, a `Local Tensor` can create a `Global Tensor` that uses the `Local Tensor` as its local component at the current node.
Below, two `local tensors` with shape ``(2,5)`` are created separately on two devices. After the ``to_global`` call, a `global tensor` with shape ``(4,5)`` is obtained.
Code running on Node 0
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Code running on Node 1
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Redistribute `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Redistributing a `Global Tensor` means moving its data to another device group (or placement), or changing its data distribution (or SBP) across the group, or both at the same time. The redistributed tensor is still a `Global Tensor`.
::
>>> import oneflow as flow
>>> x = flow.tensor([1.0, 2.0], placement=flow.placement("cuda", ranks=[0, 1]), sbp=flow.sbp.split(0))
>>> y = x.to_global(placement=flow.placement("cuda", ranks=[2, 3]), sbp=flow.sbp.broadcast)
According to the operator's semantics, OneFlow defines a set of valid input and output SBP combinations for each built-in operator, so OneFlow can automatically redistribute the `Global Tensor` to satisfy the operator's SBP requirements for its input tensors. For example, the following code:
::
>>> import oneflow as flow
>>> x = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(0))
>>> y = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(1))
>>> z = x + y
When ``x + y`` is executed, since ``x`` is split along dimension ``0`` and ``y`` is split along dimension ``1``, their local components at each node cannot be added directly. OneFlow therefore automatically redistributes one of ``x`` and ``y`` so that they share the same SBP, and completes the add operation successfully.
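One possible explicit equivalent of that automatic redistribution is to convert one operand first (an illustrative sketch)::

    >>> y2 = y.to_global(placement=flow.placement("cuda", ranks=[0, 1]),
    ...                  sbp=flow.sbp.split(0))
    >>> z = x + y2  # both operands are now split along dim 0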
.. note ::
- Global Tensor cannot be used in combination with DDP currently.
- Global Tensor requires all devices to execute at the same pace; otherwise, it may cause a multi-process deadlock.
Get Local Tensor from Global Tensor
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With the ``Tensor.to_local`` interface, a `Global Tensor` can return its local component at the current node.
::
>>> y = x.to_local()
>>> y.is_local
True
>>> y
tensor([[ 2.9186e-01, -3.9442e-01, 4.7072e-04, -3.2216e-01, 1.7788e-01],
[-4.5284e-01, 1.2361e-01, -3.5962e-01, 2.6651e-01, 1.2951e+00]],
device='cuda:0', dtype=oneflow.float32)
DistributedDataParallel
--------------------------------------------------------------
For more information about DistributedDataParallel, see ``nn.parallel.DistributedDataParallel``.
The following script shows the process of using ``oneflow.nn.parallel.DistributedDataParallel`` for data-parallel training:
.. code-block::
import oneflow as flow
from oneflow.nn.parallel import DistributedDataParallel as ddp
train_x = [
flow.tensor([[1, 2], [2, 3]], dtype=flow.float32),
flow.tensor([[4, 6], [3, 1]], dtype=flow.float32),
]
train_y = [
flow.tensor([[8], [13]], dtype=flow.float32),
flow.tensor([[26], [9]], dtype=flow.float32),
]
class Model(flow.nn.Module):
def __init__(self):
super().__init__()
self.lr = 0.01
self.iter_count = 500
self.w = flow.nn.Parameter(flow.tensor([[0], [0]], dtype=flow.float32))
def forward(self, x):
x = flow.matmul(x, self.w)
return x
m = Model().to("cuda")
m = ddp(m)
loss = flow.nn.MSELoss(reduction="sum")
optimizer = flow.optim.SGD(m.parameters(), m.lr)
for i in range(0, m.iter_count):
rank = flow.env.get_rank()
x = train_x[rank].to("cuda")
y = train_y[rank].to("cuda")
y_pred = m(x)
l = loss(y_pred, y)
if (i + 1) % 50 == 0:
print(f"{i+1}/{m.iter_count} loss:{l}")
optimizer.zero_grad()
l.backward()
optimizer.step()
print(f"\nw:{m.w}")
There are only two differences between the data-parallel training code and the stand-alone single-device script:
- Use `DistributedDataParallel` to wrap the module object (`m = ddp(m)`)
- Use `get_rank` to get the current device number and distribute the data to the device.
Then use the `launcher` to run the script and leave everything else to OneFlow, which makes distributed training as simple as stand-alone single-device training:
::
python3 -m oneflow.distributed.launch --nproc_per_node 2 ./ddp_train.py
Communication collectives
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.comm
.. autosummary::
:toctree: generated
:nosignatures:
all_reduce
all_gather
all_gather_into_tensor
all_to_all
broadcast
barrier
gather
reduce
reduce_scatter
reduce_scatter_tensor
recv
scatter
send
Launching distributed training
--------------------------------------------------------------
.. currentmodule:: oneflow.distributed
Run the commands below to see more about its usage.
......
oneflow.distributions
==================================================
.. contents:: oneflow.distributions
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. currentmodule:: oneflow.distributions
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
Distribution
Categorical
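A small sketch, assuming ``Categorical`` follows the familiar ``probs``/``sample`` interface::

    import oneflow as flow

    probs = flow.tensor([0.1, 0.2, 0.7])
    dist = flow.distributions.Categorical(probs)
    print(dist.sample())  # index in {0, 1, 2}, drawn with the given probabilities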
oneflow.env
===================================
Environment
----------------------------------
.. currentmodule:: oneflow
.. autofunction:: oneflow.env.get_world_size
.. autofunction:: oneflow.env.get_rank
.. autofunction:: oneflow.env.get_local_rank
.. autofunction:: oneflow.env.get_node_size
.. autofunction:: oneflow.env.init_rdma
.. autofunction:: oneflow.env.rdma_is_initialized
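Typical usage in a multi-process job (a sketch)::

    import oneflow as flow

    print(flow.env.get_world_size())  # total number of processes
    print(flow.env.get_rank())        # global rank of this process
    print(flow.env.get_local_rank())  # rank within the current node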
Environment Variables
================================================
OneFlow has an extensive set of environment variables to tune for specific usage.
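Most of these variables are read when OneFlow initializes, so it is safest to set them before ``import oneflow``; for example (an illustrative sketch)::

    import os

    # ONEFLOW_DEBUG_MODE is enabled by any non-empty string (see its entry below)
    os.environ["ONEFLOW_DEBUG_MODE"] = "1"

    import oneflow as flow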
`ONEFLOW_COMM_NET_IB_HCA <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp#L47>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
When there are multiple IB NICs (which can be checked with ``ibstatus`` on the server), the system uses the first IB NIC for comm_net communication by default.
When this environment variable is set, the system will check all IB NICs and pick the NIC with the matching name. `#5626 <https://github.com/Oneflow-Inc/oneflow/pull/5626>`_
Values accepted
^^^^^^^^^^^^^^^
The default value is empty. Accepted values look like ``mlx5_0:1`` or ``mlx5_1:1``. When the port is 0, it defaults to 1, representing the first port.
`ONEFLOW_COMM_NET_IB_GID_INDEX <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp#L142>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Used for the `ibv_query_gid <https://www.ibm.com/docs/en/aix/7.2?topic=management-ibv-query-gid>`_ query, where 0 represents success. It is often used together with ``ONEFLOW_COMM_NET_IB_HCA``. GID means the Global ID; a QP on a RoCE network must be built with this value instead of just using the LID as on an IB network. `#5626 <https://github.com/Oneflow-Inc/oneflow/pull/5626>`_
Values accepted
^^^^^^^^^^^^^^^
The default value is ``0``, representing the port index.
`ONEFLOW_COMM_NET_IB_QUEUE_DEPTH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp#L44>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Queue length of jobs in the IB network.
This value lets OneFlow control the size itself instead of relying on IB's default, similar to ``ONEFLOW_COMM_NET_IB_MEM_BLOCK_SIZE``.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1024``, and an ``int64_t`` is accepted. The system compares the value with ``max_qp_wr`` (the maximum number of outstanding WRs on any work queue) and takes the smaller one.
`ONEFLOW_COMM_NET_IB_MEM_BLOCK_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp#L68>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The size of each memory block read during communication.
The value determines how the data is divided into blocks, which are then encapsulated and transmitted.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``8388608`` (8M)
`ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/ep/cuda/cuda_device.cpp#L59>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Marks the CUDA events used by streams with the blocking-synchronization flag. `Detailed information <https://www.cnblogs.com/1024incn/p/5891051.html>`_, `#5612 <https://github.com/Oneflow-Inc/oneflow/pull/5612>`_, `#5837 <https://github.com/Oneflow-Inc/oneflow/pull/5837>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_LIBIBVERBS_PATH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/platform/lib/ibv_wrapper.cpp#L24>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Loads the dynamic library with dlopen at runtime so that symbols of ibverbs functions are found via dlopen instead of being linked at compile time, for better compatibility. `#4852 <https://github.com/Oneflow-Inc/oneflow/pull/4852>`_.
If loading fails, it outputs ``libibverbs not available, ibv_fork_init skipped``; if it succeeds, ``import oneflow`` outputs something like ``loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1``.
Values accepted
^^^^^^^^^^^^^^^
The default value is empty, in which case ``libibverbs.so.1`` and then ``libibverbs.so`` are loaded.
`ONEFLOW_DEBUG_MODE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/env_var/debug_mode.h#L23>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Enables ``debug`` mode; ``ONEFLOW_DEBUG`` has the same effect.
When ``debug`` mode is on, more INFO-level logs are produced, and additional ``prototxt`` and ``dot`` files are written. The automatically inserted boxing information is printed to the log file under eager global mode.
Values accepted
^^^^^^^^^^^^^^^
The default value is empty; any string is accepted.
`ONEFLOW_DRY_RUN <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/resource_desc.cpp#L65>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Only for dry runs; it can generate log files such as ``dot`` files.
The process exits once the dry run succeeds and does not attempt real training.
Values accepted
^^^^^^^^^^^^^^^
The default value is empty; any string is accepted.
`ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/lazy/stream_context/cuda/cuda_stream_context.cpp#L66>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Only used when debugging, because performance is affected; it can detect which op in the network produces nan or inf.
It creates a ``CpuCheckNumericsKernelObserver`` on ``cpu`` and a ``CudaCheckNumericsKernelObserver`` on ``cuda``. `#6052 <https://github.com/Oneflow-Inc/oneflow/pull/6052>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_DEBUG_KERNEL_SYNC_CHECK <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L193>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Only used when debugging, because performance is affected.
It creates a ``SyncCheckKernelObserver`` that synchronizes after each kernel.
It can be used to debug CUDA errors. `#6052 <https://github.com/Oneflow-Inc/oneflow/pull/6052>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_PROFILER_KERNEL_PROFILE_CUDA_MEMORY_BANDWIDTH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/kernel.cpp#L34>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Used when generating profiler files with nsys.
For now, the profiler is only valid in lazy mode.
It estimates the memory bandwidth reached by a kernel from the execution time of the GPU kernel and the size of its input and output memory, helping to find kernels that could be optimized. `Details <https://github.com/Oneflow-Inc/oneflow/blob/02e29f9648f63a4d936cd818061e90064d027005/oneflow/core/profiler/kernel.cpp#L53>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``. To use it, the package must be compiled with ``BUILD_PROFILER`` enabled.
`ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/kernel.cpp#L36>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The same as above; additionally collects the `op name <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/kernel.cpp#L62>`_.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``. To use it, the package must be compiled with ``BUILD_PROFILER`` enabled.
`ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L199>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The blob access checker exists only for correctness assurance; disabling it can reduce kernel overhead in some cases. `#5728 <https://github.com/Oneflow-Inc/oneflow/pull/5728>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/kernel/user_kernel.cpp#L692>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Takes effect when built with ``WITH_CUDA_GRAPHS``; the default value is ``false``.
Enabling CUDA Graphs support uses more memory, so it may not run when memory is already tight. `#5868 <https://github.com/Oneflow-Inc/oneflow/pull/5868>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/thread/thread.cpp#L30>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
LightActor is a new type of Actor that only handles NormalForward and similar tasks where every regst_num is 1, or tasks with only one kernel. `#5868 <https://github.com/Oneflow-Inc/oneflow/pull/5868>`_. ``export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1`` (uses more memory), ``export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1``, ``export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1``, ``export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1``, ``export ONEFLOW_STREAM_REUSE_CUDA_EVENT=1`` can be used together.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/thread/thread.cpp#L29>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
`#5720 <https://github.com/Oneflow-Inc/oneflow/pull/5720>`_. Enables the local message queue; ``oneflow.config.thread_enable_local_message_queue(True)`` is no longer used.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_PERSISTENT_IN_STREAM_BUFFER_SIZE_BYTES <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/persistence/persistent_in_stream.cpp#L30>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Represents the size of each read from disk. `#5162 <https://github.com/Oneflow-Inc/oneflow/pull/5162>`_
Values accepted
^^^^^^^^^^^^^^^
The default value is empty. If an invalid string or a negative number is given, the value falls back to ``32 * 1024`` (32 KB).
`ONEFLOW_DECODER_ENABLE_NVJPEG_HARDWARE_ACCELERATION <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/kernel/image_decoder_random_crop_resize_kernel.cpp#L290>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
``NVJPEG_VER_MAJOR`` needs to be greater than ``11``. Enables nvJPEG hardware acceleration and warms up the jpeg and hw_jpeg decoders. `#5851 <https://github.com/Oneflow-Inc/oneflow/pull/5851>`_.
It uses the hardware JPEG decoder and the NVIDIA nvJPEG library on NVIDIA A100 GPUs.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``true``; an explicit value is treated as ``true`` only when it is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_SERVING_DEBUG <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/api/cpp/framework/graph.cpp#L213>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Prints debug information for OneFlow Serving.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_DISABLE_VIEW <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/framework/tensor_methods.cpp#L35>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Disables the view mechanism, which means ops related to view will no longer run.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/auto_parallel/boxing_collector.cpp#L82>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable the middle node. When it is ``false``, all inter-SBP communication is supported.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/embedding/full_cache.cu#L414>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable NUMA-aware memory allocation when the OneEmbedding module allocates GPU memory.
NUMA-aware allocation means that when pinned host memory is allocated, the CPU close to the GPU is taken into account (for example, for GPUs 0 and 1, memory is allocated on CPU 0).
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_EP_CUDA_ENABLE_TF32_EXECUTION <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/ep/cuda/cuda_stream.cpp#L96>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to allow CUDA to use TF32 numeric types for computation
Values accepted
^^^^^^^^^^^^^^^
The default value is ``true``
`ONEFLOW_FUNCTOR_DISABLE_FUSED_MLP <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/functional/impl/nn_functor.cpp#L554>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable the fused_mlp operator implemented with cublasLt in FusedMLPFunctor; if disabled, it degenerates into multiple separate matrix multiplications.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_EMBEDDING_SHUFFLE_INDEPENTENT_STREAM <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp#L192>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to put the EmbeddingShuffle of the OneEmbedding module on a separate stream for overlapping execution.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_GRADIENT_SHUFFLE_USE_FP16 <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp#L209>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to allow the EmbeddingGradientShuffle operator of the OneEmbedding module to use the FP16 data type in the AMP case.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``true``
`ONEFLOW_ONE_EMBEDDING_NOT_FUSE_CAST_TO_UPDATE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp#L260>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable fusing the cast type conversion and the parameter update of OneEmbedding parameters into one operator under AMP.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS_DUMP <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/kernel/cpu_numerics_kernel_observer.cpp#L65>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
When NaN or Inf appears in the values, dump the data.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_ENABLE_IR_PRINTING <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/lib/OneFlow/Passes.cpp#L768>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Controls whether to print the IR when each pass runs during debugging.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_STDOUT <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/oneflow-extension/extension.cpp#L151>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control whether MLIR outputs log information in the console
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_DUMP_IR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/oneflow-extension/extension.cpp#L152>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control whether to dump ir files
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_ENABLE_ROUND_TRIP <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/oneflow-extension/ir_pass.cpp#L157>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Controls whether the OneFlow Job goes through MLIR.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_KERNEL_REDUCE_SUM_USE_MATMUL <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/user/kernels/reduce_kernel.cpp#L333>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to use matrix multiplication to implement reduce_sum.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM <https://github.com/Oneflow-Inc/oneflow/blob/dd580f21ffb6e4d23a899c7e0ac6d2bc502f3f1a/oneflow/core/job_rewriter/fuse_embedding_interaction_pass.cpp#L35>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to quantize the shuffle communication when OneEmbedding runs on multiple devices.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_TENSOR_BUFFER_ALIGNED_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L29>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Align size when allocating TensorBuffer memory
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1024``
`ONEFLOW_TENSOR_BUFFER_POOL_THREAD_LOCAL_CACHE_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L206>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the size of ``thread_local_cache`` in TensorBufferPool
Values accepted
^^^^^^^^^^^^^^^
The default value is ``64``
`ONEFLOW_GRPC_MAX_MESSAGE_BYTE_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/control/ctrl_service.cpp#L45>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Set the maximum size of the gRPC transport message
Values accepted
^^^^^^^^^^^^^^^
The default value is ``-1``
`ONEFLOW_ONE_EMBEDDING_PERSISTENT_TABLE_CAPACITY_HINT <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/embedding/persistent_table.cpp#L410>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the initial capacity of the PersistentTable of OneEmbedding to avoid frequent expansion
Values accepted
^^^^^^^^^^^^^^^
By default, OneEmbedding estimates an appropriate capacity automatically; users can also configure a larger capacity explicitly.
`ONEFLOW_ONE_EMBEDDING_PERSISTENT_TABLE_NUM_WORKERS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/embedding/persistent_table.cpp#L435>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The number of threads used for reading and writing the PersistentTable of OneEmbedding
Values accepted
^^^^^^^^^^^^^^^
The default value is ``4``
`ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/ep/cuda/cuda_device.cpp#L62>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Specify the element count of the all-zero and all-one constant buffers on the CUDA device.
These buffers can be combined with matrix multiplication to implement operations such as reduce_sum.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1024x1024``
`OMP_NUM_THREADS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L96>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Set the number of threads used by OpenMP
Values accepted
^^^^^^^^^^^^^^^
The default value will be generated by specific `computational logic <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L106-L108>`_.
`SBP_INFER_RULE_TAG <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/operator/operator.cpp#L718>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Specify SBP derivation rules
Values accepted
^^^^^^^^^^^^^^^
When the value is ``1``, select the SBP that satisfies the producer, or the SBP with the smallest cost, whenever possible.
When the value is ``2``, select the SBP that matches best.
When the value is ``3``, select the SBP with the smallest cost.
`ONEFLOW_TENSOR_BUFFER_GROWTH_FACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L35>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the growth factor of TensorBuffer
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1.0``
`ONEFLOW_TENSOR_BUFFER_SHRINK_FACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L41>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the shrink factor of TensorBuffer
Values accepted
^^^^^^^^^^^^^^^
The default value is ``0.7``
`ONEFLOW_TENSOR_BUFFER_POOL_SIZE_FACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L200>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the size factor of TensorBuffer
Values accepted
^^^^^^^^^^^^^^^
The default value is ``2.0``
`AUTO_PARALLEL_TRANSFER_COST <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/framework/sbp_infer_util.cpp#L544>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the transfer cost used in automatic parallelization
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1.65e8``
`ONEFLOW_DEBUG_PASS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/job_build_and_infer_ctx.cpp#L991>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Specify a pass name to print the job before and after that pass, e.g. ``export ONEFLOW_DEBUG_PASS="FuseAddToOutputPass"``.
Or specify ``ALL`` to print the job before and after every pass, e.g. ``export ONEFLOW_DEBUG_PASS="ALL"``.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``empty``
`ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/profiler.cpp#L39>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Add a prefix to the names of host threads in the profiling context to facilitate sorting in the visualization tool (Nsight)
Values accepted
^^^^^^^^^^^^^^^
The default value is ``empty``
oneflow.nn.functional
===========================================
Functional operations for neural networks
-------------------------------------------
.. currentmodule:: oneflow.nn.functional
.. autofunction:: conv1d
.. autofunction:: conv2d
.. autofunction:: conv3d
.. autofunction:: conv_transpose1d
.. autofunction:: conv_transpose2d
.. autofunction:: conv_transpose3d
.. autofunction:: adaptive_avg_pool1d
.. autofunction:: adaptive_avg_pool2d
.. autofunction:: adaptive_avg_pool3d
.. autofunction:: relu
.. autofunction:: hardsigmoid
.. autofunction:: hardshrink
.. autofunction:: hardswish
.. autofunction:: hardtanh
.. autofunction:: normalize
.. autofunction:: layer_norm
.. autofunction:: leaky_relu
.. autofunction:: elu
.. autofunction:: celu
.. autofunction:: selu
.. autofunction:: sigmoid
.. autofunction:: pad
.. autofunction:: prelu
.. autofunction:: logsigmoid
.. autofunction:: log_softmax
.. autofunction:: gelu
.. autofunction:: glu
.. autofunction:: softsign
.. autofunction:: softmax
.. autofunction:: softplus
.. autofunction:: tanh
.. autofunction:: threshold
.. autofunction:: softshrink
.. autofunction:: silu
.. autofunction:: mish
.. autofunction:: one_hot
.. autofunction:: triplet_margin_loss
.. autofunction:: dropout
.. autofunction:: affine_grid
.. autofunction:: grid_sample
.. autofunction:: interpolate
.. autofunction:: ctc_greedy_decoder
.. autofunction:: sparse_softmax_cross_entropy
.. autofunction:: embedding
.. autofunction:: linear
.. autofunction:: cosine_similarity
.. autofunction:: cross_entropy
oneflow.nn.Graph
============================================================
Base class for running neural networks in Static Graph Mode.
Currently, there are two main ways to run models in deep learning frameworks: dynamic graphs and static graphs, which are conventionally referred to as :ref:`dynamic graph` and :ref:`static graph` mode in OneFlow.
Both approaches have their advantages and disadvantages, and OneFlow supports both, with Eager mode being the default.
Generally speaking, dynamic graphs are easier to use, while static graphs offer better performance. OneFlow provides the :class:`oneflow.nn.Graph` module to let users build static graphs and train models with Eager-like programming conventions.
.. contents:: oneflow.nn.Graph
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. _dynamic graph:
Eager Mode to Static Graph Mode
------------------------------------------------------------
.. currentmodule:: oneflow.nn
.. autoclass:: oneflow.nn.Graph
:members: __init__,
build,
__call__,
add_optimizer,
set_grad_scaler,
state_dict,
load_state_dict,
name,
debug,
__repr__,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.graph_config.GraphConfig
:members: enable_amp,
enable_zero,
allow_fuse_model_update_ops,
allow_fuse_add_to_output,
allow_fuse_cast_scale,
set_gradient_accumulation_steps,
enable_cudnn_conv_heuristic_search_algo,
enable_straighten_algorithm,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.block_config.BlockConfig
:members: stage_id,
set_stage,
activation_checkpointing,
:member-order: bysource
OneFlow runs in Eager mode by default.
OneFlow's nn.Graph is programmed in a style very similar to Eager mode, so small changes can deliver large performance gains.
The following script shows the process of building a neural network in eager mode using the interface under ``oneflow.nn``:
.. code-block::

    import oneflow as flow
    import oneflow.nn as nn

    class ModuleMyLinear(nn.Module):
        def __init__(self, in_features, out_features):
            super().__init__()
            self.weight = nn.Parameter(flow.randn(in_features, out_features))
            self.bias = nn.Parameter(flow.randn(out_features))

        def forward(self, input):
            return flow.matmul(input, self.weight) + self.bias

    linear_model = ModuleMyLinear(4, 3)
An eager ``nn.Module`` can be reused by ``nn.Graph``. The eager-mode script above can be switched to static graph mode by adding just a few lines of code, following these steps:
- Define your customized graph as a subclass of ``nn.Graph``
- At the beginning of ``__init__``, call ``super().__init__()`` to let OneFlow do the necessary initialization of the Graph
- Reuse the Eager-mode ``nn.Module`` object in ``__init__`` (``self.model = model``)
- Describe the computation in the ``build`` method
- Instantiate your graph, then call it.
.. code-block::

    class GraphMyLinear(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model

        def build(self, input):
            return self.model(input)

    graph_mylinear = GraphMyLinear()
    input = flow.randn(1, 4)
    out = graph_mylinear(input)
    print(out)
    # Output: tensor([[-0.3298, -3.7907, 0.1661]], dtype=oneflow.float32)
.. _static graph:
Static Graph Mode
------------------------------------------------------------
Constructing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Base class for training or evaluating a neural network in static graph mode.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__init__
build
add_optimizer
set_grad_scaler
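A minimal sketch of a training graph built from the ``linear_model`` defined earlier; the loss function and optimizer chosen here are only illustrative:

.. code-block::

    import oneflow as flow
    import oneflow.nn as nn

    class GraphMyLinearTrain(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model                  # reuse the eager nn.Module
            self.loss_fn = nn.MSELoss()
            optimizer = flow.optim.SGD(self.model.parameters(), lr=1e-3)
            self.add_optimizer(optimizer)              # register the optimizer with the graph

        def build(self, x, y):
            y_pred = self.model(x)
            loss = self.loss_fn(y_pred, y)
            loss.backward()                            # backward is called inside build for training graphs
            return loss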
Executing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Call an ``nn.Graph`` instance to run a customized graph.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__call__
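For example, with the ``GraphMyLinear`` class defined earlier, the first call compiles the graph and later calls reuse the compiled plan:

.. code-block::

    graph_mylinear = GraphMyLinear()
    x = flow.randn(1, 4)
    out = graph_mylinear(x)   # the first call triggers compilation; later calls run the static graph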
Config options on a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Optimization options of an ``nn.Graph``.
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_amp
enable_zero
allow_fuse_model_update_ops
allow_fuse_add_to_output
allow_fuse_cast_scale
set_gradient_accumulation_steps
enable_cudnn_conv_heuristic_search_algo
enable_straighten_algorithm
enable_compress_memory
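These options are set through the graph's ``config`` attribute inside ``__init__``; a short sketch building on the earlier example (the specific options enabled here are only illustrative):

.. code-block::

    class GraphMyLinearAMP(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model
            self.config.enable_amp(True)                 # run the graph with automatic mixed precision
            self.config.allow_fuse_add_to_output(True)   # allow fusing element-wise add into the producer op

        def build(self, input):
            return self.model(input)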
Config options on a GraphModule
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
GraphModule is the graph representation of an ``nn.Module`` in an ``nn.Graph``.
When an ``nn.Module`` is added into an ``nn.Graph``, it is wrapped into a ProxyModule, which holds a GraphModule inside it.
You can get and set the GraphModule to enable graph optimizations on that ``nn.Module``.
.. currentmodule:: oneflow.nn.graph.graph_block.GraphModule
.. autosummary::
:toctree: generated
:nosignatures:
set_stage
activation_checkpointing
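A minimal sketch of setting one of these options inside ``__init__``. This assumes the GraphModule of a wrapped module is obtained with ``.to(nn.graph.GraphModule)``; consult the ``set_stage`` and ``activation_checkpointing`` pages generated above for the exact accessor:

.. code-block::

    import oneflow as flow
    import oneflow.nn as nn

    class GraphWithCheckpointing(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model  # the eager module defined earlier
            # Assumption: .to(nn.graph.GraphModule) returns the GraphModule held by the ProxyModule.
            self.model.to(nn.graph.GraphModule).activation_checkpointing = True

        def build(self, input):
            return self.model(input)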
Save & Load a Model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
state_dict
load_state_dict
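A minimal sketch of saving and restoring graph states with ``oneflow.save`` / ``oneflow.load`` (the path below is illustrative):

.. code-block::

    graph = GraphMyLinear()

    flow.save(graph.state_dict(), "my_graph")        # persist the graph states
    graph.load_state_dict(flow.load("my_graph"))     # restore them into a compatible graph instance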
Debug a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
__repr__
debug
name
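For example, turning on debug output before the first call prints build information, and ``__repr__`` / ``name`` help identify the graph; a small sketch reusing ``GraphMyLinear``:

.. code-block::

    graph = GraphMyLinear()
    graph.debug(1)                  # print build information at verbosity level 1
    out = graph(flow.randn(1, 4))
    print(repr(graph))              # summary of the graph structure
    print(graph.name)               # auto-generated unique name of this graph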
oneflow.hub
===================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/hub.html
OneFlow Hub is a pre-trained model repository designed to facilitate research reproducibility.
Publishing models
-----------------
OneFlow Hub supports publishing pre-trained models (model definitions and pre-trained weights)
to a GitHub repository by adding a simple ``hubconf.py`` file;
``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function
(for example, a pre-trained model you want to publish).
::
    def entrypoint_name(*args, **kwargs):
        # args & kwargs are optional, for models which take positional/keyword arguments.
        ...
How to implement an entrypoint?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is a code snippet that specifies an entrypoint for the ``resnet18`` model, expanding
the implementation in ``Oneflow-Inc/vision/hubconf.py``.
In most cases, importing the right function in ``hubconf.py`` is sufficient; we
use the expanded version here only as an example to show how it works.
You can see the full script in
`Oneflow-Inc/vision repo <https://github.com/Oneflow-Inc/vision/blob/master/hubconf.py>`_
::
    dependencies = ['oneflow']
    from flowvision.models.resnet import resnet18 as _resnet18

    # resnet18 is the name of entrypoint
    def resnet18(pretrained=False, **kwargs):
        """ # This docstring shows up in hub.help()
        Resnet18 model
        pretrained (bool): kwargs, load pretrained weights into the model
        """
        # Call the model, load pretrained weights
        model = _resnet18(pretrained=pretrained, **kwargs)
        return model
- The ``dependencies`` variable is a **list** of package names required to **load** the model. Note that this might
  be slightly different from the dependencies required to train the model.
- ``args`` and ``kwargs`` are passed along to the real callable function.
- The docstring of the function works as a help message. It explains what the model does and which
  positional/keyword arguments are allowed. It's highly recommended to add a few examples here.
- An entrypoint function can either return a model (``nn.Module``) or auxiliary tools that make the user workflow smoother, e.g. tokenizers.
- Callables prefixed with an underscore are considered helper functions and won't show up in :func:`oneflow.hub.list()`.
- Pretrained weights can either be stored locally in the GitHub repo or be loadable by
  :func:`oneflow.hub.load_state_dict_from_url()`. If they are smaller than 2GB, it's recommended to attach them to a `project release <https://help.github.com/en/articles/distributing-large-binaries>`_
  and use the URL from the release.
In the example above, ``flowvision.models.resnet.resnet18`` handles ``pretrained``; alternatively, you can put the following logic in the entrypoint definition.
::
    if pretrained:
        # For checkpoint saved in local github repo, e.g. <RELATIVE_PATH_TO_CHECKPOINT>=weights/save.pth
        dirname = os.path.dirname(__file__)
        checkpoint = os.path.join(dirname, <RELATIVE_PATH_TO_CHECKPOINT>)
        state_dict = oneflow.load(checkpoint)
        model.load_state_dict(state_dict)

        # For checkpoint saved elsewhere
        checkpoint = 'https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip'
        model.load_state_dict(oneflow.hub.load_state_dict_from_url(checkpoint, progress=False))
Important Notice
^^^^^^^^^^^^^^^^
- The published models should be at least in a branch/tag. It can't be a random commit.
Loading models from Hub
-----------------------
OneFlow Hub provides convenient APIs to explore all available models in the hub
through :func:`oneflow.hub.list()`, show docstrings and examples through
:func:`oneflow.hub.help()`, and load pre-trained models using
:func:`oneflow.hub.load()`.
.. automodule:: oneflow.hub
.. autofunction:: list
.. autofunction:: help
.. autofunction:: load
.. autofunction:: download_url_to_file
.. autofunction:: load_state_dict_from_url
Running a loaded model:
^^^^^^^^^^^^^^^^^^^^^^^
Note that ``*args`` and ``**kwargs`` in :func:`oneflow.hub.load()` are used to
**instantiate** a model. After you have loaded a model, how can you find out
what you can do with the model?
A suggested workflow is:
- ``dir(model)`` to see all available methods of the model.
- ``help(model.foo)`` to check what arguments ``model.foo`` takes to run.
To help users explore without referring to documentation back and forth, we strongly
recommend repo owners make function help messages clear and succinct. It's also helpful
to include a minimal working example.
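A short sketch of this workflow, using the ``Oneflow-Inc/vision`` repo and the ``resnet18`` entrypoint from the example above:

.. code-block::

    import oneflow.hub as hub

    model = hub.load('Oneflow-Inc/vision', 'resnet18', pretrained=True)
    print(dir(model))       # list the attributes and methods the model exposes
    help(model.forward)     # show the help message for a specific method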
Where are my downloaded models saved?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The locations are used in the following order:
- Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``
- ``$ONEFLOW_HOME/hub``, if environment variable ``ONEFLOW_HOME`` is set.
- ``$XDG_CACHE_HOME/oneflow/hub``, if environment variable ``XDG_CACHE_HOME`` is set.
- ``~/.cache/oneflow/hub``
.. autofunction:: get_dir
.. autofunction:: set_dir
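For example (the cache path below is hypothetical):

.. code-block::

    import oneflow.hub as hub

    hub.set_dir('/data/oneflow_hub')   # override the default hub cache location (hypothetical path)
    print(hub.get_dir())               # -> /data/oneflow_hub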
Caching logic
^^^^^^^^^^^^^
By default, we don't clean up files after loading them. Hub uses the cache by default if a model already exists in the
directory returned by :func:`~oneflow.hub.get_dir()`.
Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
the existing GitHub folder and downloaded weights and reinitialize a fresh download. This is useful
when updates are published to the same branch, so users can keep up with the latest release.
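For example, to bypass the cache entirely:

.. code-block::

    import oneflow.hub as hub

    # Re-download both the repo and the weights instead of reusing the cache
    model = hub.load('Oneflow-Inc/vision', 'resnet18', pretrained=True, force_reload=True)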
Known limitations:
^^^^^^^^^^^^^^^^^^
OneFlow Hub works by importing the package as if it were installed. There are some side effects
introduced by importing in Python. For example, you can see new items in the Python caches
``sys.modules`` and ``sys.path_importer_cache``, which is normal Python behavior.
This also means that you may have import errors when importing different models
from different repos, if the repos have the same sub-package names (typically, a
``model`` subpackage). A workaround for these kinds of import errors is to
remove the offending sub-package from the ``sys.modules`` dict; more details can
be found in `this github issue
<https://github.com/pytorch/hub/issues/243#issuecomment-942403391>`_.
A known limitation worth mentioning here: users **CANNOT** load two different branches of
the same repo in the **same Python process**. This is just like installing two packages with the
same name in Python, which is not supported. The cache might also give you surprises if you
actually try it. Of course, it's totally fine to load them in separate processes.
......@@ -3,9 +3,14 @@ oneflow.nn.image
Image operations for neural networks
--------------------------------------
.. currentmodule:: oneflow.nn.image
.. automodule:: oneflow.nn.image
:members: Resize,
batch_align,
decode,
flip,
.. autosummary::
:toctree: generated
:nosignatures:
Resize
batch_align
decode
flip
normalize