Commit a715222c authored by yuguo's avatar yuguo
Browse files

0.9.1-rocm

parent f262efc9
......@@ -45,7 +45,7 @@ file(
"${PROJECT_SOURCE_DIR}/oneflow/user/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/api/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/maybe/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/extension/python/*.*")
"${PROJECT_SOURCE_DIR}/oneflow/extension/*.*")
foreach(oneflow_single_file ${oneflow_all_src})
# Verify whether this file is for other platforms
......@@ -80,6 +80,21 @@ foreach(oneflow_single_file ${oneflow_all_src})
if(BUILD_CUDA)
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
if(BUILD_ROCM)
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cu$")
get_filename_component(oneflow_single_file_hip_cpp_dir ${oneflow_single_file} DIRECTORY)
get_filename_component(oneflow_single_file_hip_cpp ${oneflow_single_file} NAME_WE)
add_custom_command(
OUTPUT "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
COMMAND ${CMAKE_COMMAND} -E copy "${oneflow_single_file}" "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
DEPENDS "${oneflow_single_file}"
)
list(APPEND of_all_obj_cc ${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp)
endif()
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cuh$")
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
endif()
set(group_this ON)
endif()
......@@ -96,8 +111,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
set(group_this ON)
endif()
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/extension/python/.*\\.(h|cpp)$")
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/extension/.*\\.(c|h|cpp)$")
list(APPEND of_pyext_obj_cc ${oneflow_single_file})
set(group_this ON)
endif()
......@@ -105,7 +119,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*\\.cpp$")
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*_test\\.cpp$")
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe|thread)/.*_test\\.cpp$")
# test file
list(APPEND of_all_test_cc ${oneflow_single_file})
elseif(APPLE AND "${oneflow_single_file}" MATCHES
......@@ -136,6 +150,7 @@ add_custom_target(
of_format
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${CMAKE_CURRENT_SOURCE_DIR}/oneflow --fix
--exclude="oneflow/user/kernels/fmha_flash_attention"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${ONEFLOW_PYTHON_DIR} --fix --exclude="oneflow/include" --exclude="oneflow/core"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_clang_format.py --source_dir
......@@ -254,20 +269,22 @@ if("${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/32805e60c9de1f82887cd2af30d247dcabd2e1d3.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "f2f17229cf21049663b8ef4f2b6b8062"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "6b7c6506d5922de9632c8ff012b2f945"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e0ea669a9f0872d35bffda5ec6c5ac6f"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "241a333828bba1efa35aff4c4fc2ce87"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "075fbfdf06cb3f02373ea44971af7b03"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2")
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "334997b4879aba15d9323a732356cf2a")
unset(LLVM_MONO_REPO_URL CACHE)
unset(LLVM_MONO_REPO_MD5 CACHE)
endif()
set(LLVM_MONO_REPO_URL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
set(LLVM_MONO_REPO_URL "https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-15.0.6.zip"
CACHE STRING "")
use_mirror(VARIABLE LLVM_MONO_REPO_URL URL ${LLVM_MONO_REPO_URL})
set(LLVM_MONO_REPO_MD5 "334997b4879aba15d9323a732356cf2a" CACHE STRING "")
set(LLVM_MONO_REPO_MD5 "1ccc00accc87a1a5d42a275d6e31cd8c" CACHE STRING "")
set(ONEFLOW_BUILD_ROOT_DIR "${PROJECT_BINARY_DIR}")
add_subdirectory(${PROJECT_SOURCE_DIR}/oneflow/ir)
if(WITH_MLIR)
......@@ -306,9 +323,9 @@ elseif(UNIX)
${oneflow_third_party_libs}
${EXTERNAL_TARGETS}
-Wl,--no-whole-archive
-Wl,--as-needed
-ldl
-lrt
-Wl,--version-script ${PROJECT_SOURCE_DIR}/version_script.lds)
-lrt)
if(BUILD_CUDA)
target_link_libraries(oneflow CUDA::cudart_static)
endif()
......@@ -317,6 +334,43 @@ elseif(WIN32)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /WHOLEARCHIVE:oneflow")
endif()
if (BUILD_ROCM)
# AMD compiler fails to compile these three files with '-O1/2/3'.
# The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
# so '-O0' will override '-O1/2/3'.
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel_hip.cpp
#${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math1_hip.cpp
PROPERTIES COMPILE_OPTIONS "-O0")
endif()
if(BUILD_CUDA)
string(JOIN "," CUDA_REAL_ARCHS ${CUDA_REAL_ARCHS_LIST})
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/core/hardware/cuda_device_descriptor.cpp
PROPERTIES COMPILE_FLAGS "-DCUDA_REAL_ARCHS=\"${CUDA_REAL_ARCHS}\"")
endif()
if(BUILD_CUDA AND WITH_CUTLASS)
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
add_definitions(-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1)
endif()
set_property(
SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND PROPERTY INCLUDE_DIRECTORIES
${CUTLASS_INSTALL_DIR}/examples/41_fused_multi_head_attention)
set_property(SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_glu_kernel.cu APPEND
PROPERTY INCLUDE_DIRECTORIES ${CUTLASS_INSTALL_DIR}/examples/45_dual_gemm)
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA")
set_property(
SOURCE
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND
PROPERTY COMPILE_OPTIONS "--use_fast_math")
endif()
endif()
# oneflow api common
if(BUILD_PYTHON OR BUILD_CPP_API)
file(GLOB_RECURSE of_api_common_files ${PROJECT_SOURCE_DIR}/oneflow/api/common/*.h
......@@ -343,6 +397,8 @@ if(BUILD_PYTHON)
add_dependencies(of_pyext_obj oneflow)
pybind11_add_module(oneflow_internal ${PYBIND11_SRCS} ${of_pybind_obj_cc} ${PYBIND_REGISTRY_CC})
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cublas/lib")
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cudnn/lib")
set_compile_options_to_oneflow_target(oneflow_internal)
set_property(TARGET oneflow_internal PROPERTY CXX_VISIBILITY_PRESET "default")
add_dependencies(oneflow_internal of_functional_obj of_functional_tensor_obj of_op_schema)
......@@ -419,6 +475,9 @@ if(BUILD_TESTING)
oneflow_add_test(oneflow_testexe SRCS ${of_all_test_cc} TEST_NAME oneflow_test)
target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs} glog::glog
${oneflow_test_libs})
if(WITH_MLIR)
target_link_libraries(oneflow_testexe MLIROneFlowExtension)
endif()
endif()
if(BUILD_CPP_API)
......@@ -524,6 +583,10 @@ if(BUILD_CPP_API)
if(BUILD_CUDA)
checkdirandappendslash(DIR ${NCCL_LIBRARY_DIR} OUTPUT NCCL_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${NCCL_LIBRARY_DIR_APPENDED})
if(WITH_CUTLASS)
checkdirandappendslash(DIR ${CUTLASS_LIBRARY_DIR} OUTPUT CUTLASS_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${CUTLASS_LIBRARY_DIR_APPENDED})
endif()
endif()
install(
......@@ -555,6 +618,7 @@ if(BUILD_CPP_API)
llvm-PerfectShuffle
llvm-tblgen
mlir-tblgen
mlir-pdll
obj2yaml
oneflow_tblgen
yaml-bench
......
......@@ -38,7 +38,13 @@ set(ONEFLOW_OP_GROUPS
"TRIGONOMETRIC"
"UNARY"
"UPSAMPLE"
"ONE_EMBEDDING")
"ONE_EMBEDDING"
"LINEAR_ALGEBRA"
"SYSTEM")
if(WITH_MLIR)
list(APPEND ONEFLOW_OP_GROUPS "MLIR_JIT")
endif(WITH_MLIR)
foreach(OP_GROUP_NAME IN LISTS ONEFLOW_OP_GROUPS)
list(APPEND ONEFLOW_SCHEMA_TABLEGEN_FLAGS "-DGET_ONEFLOW_${OP_GROUP_NAME}_OP_DEFINITIONS")
endforeach()
......
......@@ -137,7 +137,8 @@ endif()
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${RE2_INCLUDE_DIR})
if(BUILD_CUDA)
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
# Always use third_party/cub for Clang CUDA in case of compatibility issues
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" AND CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
if(CMAKE_CXX_STANDARD LESS 14)
add_definitions(-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)
add_definitions(-DCUB_IGNORE_DEPRECATED_CPP11)
......@@ -150,6 +151,7 @@ if(BUILD_CUDA)
list(APPEND oneflow_third_party_dependencies cub_copy_headers_to_destination)
endif()
include(nccl)
include(cutlass)
list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES})
list(APPEND oneflow_third_party_libs ${CUDNN_LIBRARIES})
......@@ -159,12 +161,19 @@ if(BUILD_CUDA)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUDNN_INCLUDE_DIRS} ${CUB_INCLUDE_DIR}
${NCCL_INCLUDE_DIR})
endif()
if(WITH_CUTLASS)
list(APPEND oneflow_third_party_dependencies cutlass)
list(APPEND oneflow_third_party_dependencies cutlass_copy_examples_to_destination)
list(APPEND oneflow_third_party_libs ${CUTLASS_LIBRARIES})
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUTLASS_INCLUDE_DIR})
endif()
endif()
if (BUILD_ROCM)
# Find rocm packages
find_package(hip)
find_package(hipfft)
find_package(hipblas)
find_package(hipcub)
find_package(hiprand)
......@@ -173,11 +182,31 @@ if (BUILD_ROCM)
find_package(rccl)
add_definitions(-DWITH_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
add_definitions(-D__HIPCC__)
if (BUILD_ROCM_GRAPHS)
add_definitions(-DWITH_ROCM_GRAPHS)
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-macro-redefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-inconsistent-missing-override")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-exceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-negative")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-overflow")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-duplicate-decl-specifier")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-implicit-int-float-conversion")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pass-failed")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-gpu-rdc")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs hip::hipfft)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
list(APPEND oneflow_third_party_libs roc::rocrand)
......@@ -186,16 +215,17 @@ if (BUILD_ROCM)
link_directories(${ROCM_PATH}/rccl/lib)
list(APPEND oneflow_third_party_libs rccl)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${HIP_INCLUDE_DIRS}
${HIPFFT_INCLUDE_DIRS}
${HIPBLAS_INCLUDE_DIRS}
${HIPCUB_INCLUDE_DIRS}
"${ROCM_PATH}/hiprand/include"
"${ROCM_PATH}/rocrand/include"
"${ROCM_PATH}/roctracer/include"
${MIOPEN_INCLUDE_DIRS}
${RCCL_INCLUDE_DIRS})
message(STATUS "ONEFLOW_THIRD_PARTY_INCLUDE_DIRS: ${ONEFLOW_THIRD_PARTY_INCLUDE_DIRS}")
endif()
if(BUILD_RDMA)
if(UNIX)
include(CheckIncludeFiles)
......
include(ExternalProject)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(WITH_CUTLASS_INIT OFF)
else()
set(WITH_CUTLASS_INIT ON)
endif()
set(WITH_CUTLASS ${WITH_CUTLASS_INIT} CACHE BOOL "")
if(WITH_CUTLASS)
add_definitions(-DWITH_CUTLASS)
find_package(Threads)
set(CUTLASS_PROJECT cutlass)
set(CUTLASS_INSTALL_DIR ${THIRD_PARTY_DIR}/cutlass)
set(CUTLASS_INCLUDE_DIR ${CUTLASS_INSTALL_DIR}/include CACHE PATH "" FORCE)
set(CUTLASS_LIBRARY_DIR ${CUTLASS_INSTALL_DIR}/lib CACHE PATH "" FORCE)
set(CUTLASS_LIBRARIES ${CUTLASS_LIBRARY_DIR}/libcutlass.so)
set(CUTLASS_SOUREC_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass/src/cutlass/)
foreach(arch ${CUDA_REAL_ARCHS_LIST})
if(arch GREATER_EQUAL 70)
list(APPEND CUTLASS_REAL_ARCHS ${arch})
endif()
endforeach()
if(THIRD_PARTY)
ExternalProject_Add(
${CUTLASS_PROJECT}
PREFIX cutlass
URL ${CUTLASS_URL}
URL_MD5 ${CUTLASS_MD5}
UPDATE_COMMAND ""
BUILD_BYPRODUCTS ${CUTLASS_LIBRARIES}
CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_DEBUG:STRING=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_CXX_FLAGS_RELEASE:STRING=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_CACHE_ARGS
-DCMAKE_CUDA_COMPILER:STRING=${CUDAToolkit_NVCC_EXECUTABLE}
-DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
-DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
-DCMAKE_INSTALL_PREFIX:PATH=${CUTLASS_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${CUTLASS_LIBRARY_DIR}
-DCMAKE_INSTALL_MESSAGE:STRING=${CMAKE_INSTALL_MESSAGE}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCUTLASS_LIBRARY_OPERATIONS:STRING=conv2d
-DCUTLASS_LIBRARY_KERNELS:STRING=simt_hfprop_*,tensorop_f16_*fprop,tensorop_h*fprop
-DCUTLASS_ENABLE_EXAMPLES:BOOL=OFF
-DCUTLASS_ENABLE_PROFILER:BOOL=OFF
-DCUTLASS_ENABLE_LIBRARY:BOOL=ON
-DCUTLASS_NVCC_ARCHS:STRING=${CUTLASS_REAL_ARCHS}
-DCUTLASS_ENABLE_TESTS:BOOL=OFF
-DCUTLASS_UNITY_BUILD_ENABLED:BOOL=ON
-DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING=
-DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF)
add_custom_target(cutlass_copy_examples_to_destination DEPENDS cutlass)
set(CUTLASS_SOURCE_EXAMPLES_DIR ${CUTLASS_SOUREC_DIR}/examples)
set(CUTLASS_INSTALL_EXAMPLES_FILES
"41_fused_multi_head_attention/iterators/make_residual_last.h"
"41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h"
"41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h"
"41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h"
"41_fused_multi_head_attention/mma_from_smem.h"
"41_fused_multi_head_attention/epilogue_rescale_output.h"
"41_fused_multi_head_attention/attention_scaling_coefs_updater.h"
"41_fused_multi_head_attention/gemm_kernel_utils.h"
"41_fused_multi_head_attention/fmha_grouped_problem_visitor.h"
"41_fused_multi_head_attention/fmha_grouped.h"
"41_fused_multi_head_attention/default_fmha_grouped.h"
"41_fused_multi_head_attention/epilogue_pipelined.h"
"41_fused_multi_head_attention/epilogue_thread_apply_logsumexp.h"
"41_fused_multi_head_attention/kernel_forward.h"
"41_fused_multi_head_attention/gemm/custom_mma_multistage.h"
"41_fused_multi_head_attention/gemm/custom_mma_base.h"
"41_fused_multi_head_attention/gemm/custom_mma.h"
"41_fused_multi_head_attention/gemm/custom_mma_pipelined.h"
"41_fused_multi_head_attention/find_default_mma.h"
"41_fused_multi_head_attention/debug_utils.h"
"45_dual_gemm/test_run.h"
"45_dual_gemm/kernel/dual_gemm.h"
"45_dual_gemm/device/dual_gemm.h"
"45_dual_gemm/dual_gemm_run.h"
"45_dual_gemm/thread/left_silu_and_mul.h"
"45_dual_gemm/threadblock/dual_mma_multistage.h"
"45_dual_gemm/threadblock/dual_epilogue.h"
"45_dual_gemm/threadblock/dual_mma_base.h")
foreach(filename ${CUTLASS_INSTALL_EXAMPLES_FILES})
add_custom_command(
TARGET cutlass_copy_examples_to_destination
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CUTLASS_SOURCE_EXAMPLES_DIR}/${filename}
${CUTLASS_INSTALL_DIR}/examples/${filename})
endforeach()
endif(THIRD_PARTY)
endif(WITH_CUTLASS)
......@@ -7,12 +7,9 @@ set(EIGEN_URL https://github.com/Oneflow-Inc/eigen-git-mirror/archive/refs/tags/
set(EIGEN_MD5 a23cb70e12d1bf9b09cb28af51bc26ae)
use_mirror(VARIABLE EIGEN_URL URL ${EIGEN_URL})
add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING)
if(BUILD_CUDA)
add_definitions(-DEIGEN_USE_GPU)
endif()
add_definitions(-DEIGEN_NO_MALLOC)
#add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING -DEIGEN_NO_MALLOC -DEIGEN_USE_GPU)
if(THIRD_PARTY)
......
......@@ -34,11 +34,36 @@ else()
set(NCCL_INCLUDE_DIR ${NCCL_INSTALL_DIR}/include)
set(NCCL_LIBRARY_DIR ${NCCL_INSTALL_DIR}/lib)
# Versions 2.13 and above may cause deadlocks
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.15.1-1.tar.gz)
set(NCCL_MD5 37b787ff8934cd9374b4612f663c17fa)
else()
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.12.10-1.tar.gz)
set(NCCL_MD5 bdb91f80b78c99831f09ca8bb28a1032)
endif()
use_mirror(VARIABLE NCCL_URL URL ${NCCL_URL})
list(APPEND NCCL_LIBRARIES ${NCCL_LIBRARY_DIR}/${NCCL_LIBRARY_NAME})
set(NCCL_ARCHS_LIST ${CUDA_REAL_ARCHS_LIST})
# remove redundant archs, https://github.com/NVIDIA/nccl/blob/cb111f764a6d46370f24f75101d6b219bb2dda54/makefiles/common.mk#L28
if("70" IN_LIST NCCL_ARCHS_LIST AND "75" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "75")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "86" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "86")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "89" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "89")
endif()
foreach(arch ${NCCL_ARCHS_LIST})
string(APPEND NCCL_GENCODE "-gencode=arch=compute_${arch},code=sm_${arch} ")
endforeach()
if(THIRD_PARTY)
include(ProcessorCount)
......@@ -47,11 +72,12 @@ else()
nccl
PREFIX nccl
URL ${NCCL_URL}
URL_MD5 bdb91f80b78c99831f09ca8bb28a1032
URL_MD5 ${NCCL_MD5}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j${PROC_NUM} src.build CUDA_HOME=${CUDATOOLKIT_BIN_ROOT}
NVCC_GENCODE=${NCCL_GENCODE}
INSTALL_COMMAND make src.install PREFIX=${NCCL_INSTALL_DIR}
BUILD_BYPRODUCTS ${NCCL_LIBRARIES})
......
......@@ -14,3 +14,4 @@ dataclasses; python_version<"3.7"
cmakelang==0.6.13
pytest-xdist
rich
portalocker
Auto Parallelism
====================================================
As deep-learning models grow larger and larger, distributed training,
or parallelism, becomes necessary. Data parallelism and model parallelism have been designed
to speed up training and to relieve memory pressure.
In OneFlow, the SBP signature lets users configure the parallelism policy easily.
However, users still need to specify the SBP property for each operator, or at least for most of them.
Users might spend days digging into the details of parallelism and still end up with
low throughput simply because of a slight mistake in the SBP signature configuration.
.. note::
It only works in :doc:`graph` mode.
Our strength
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To get rid of all those SBP signature configurations, we developed auto parallelism.
Still, placement configurations are necessary; auto placement is not supported yet.
If you are reading this paragraph before rushing into any SBP details, then congratulations:
you do not need to learn SBP. You can start writing your code just as you would in CPU mode.
Auto parallelism generates a fast strategy customized for your specific model,
the size of its parameters, and the number of available GPUs.
How to use auto parallelism?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You simply need to enable the corresponding setting in the model
of :doc:`graph` .
Example::
import oneflow as flow
class SubclassGraph(flow.nn.Graph):
def __init__(self):
super().__init__() # MUST be called
# auto parallelism configuration
self.config.enable_auto_parallel(True)
# other configurations about auto parallelism
# ......
def build(self):
pass
.. warning::
If you enable auto parallelism, OneFlow will take care of the SBP configurations
of operators, except for explicit ``to_global`` calls.
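For example, a minimal sketch (assuming the module's parameters have already been made global with a suitable placement) that combines auto parallelism with one explicit ``to_global`` call::

    import oneflow as flow

    class AutoParallelGraph(flow.nn.Graph):
        def __init__(self, model):
            super().__init__()
            # let auto parallelism search SBP signatures for all other operators
            self.config.enable_auto_parallel(True)
            self.model = model

        def build(self, x):
            # this explicit SBP configuration is respected by auto parallelism
            x = x.to_global(placement=flow.placement("cuda", ranks=[0, 1]),
                            sbp=flow.sbp.split(0))
            return self.model(x)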
Configuration API for auto parallelism
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_auto_parallel
enable_auto_parallel_ignore_user_sbp_config
set_auto_parallel_computation_cost_ratio
set_auto_parallel_wait_time
enable_auto_parallel_trunk_algo
enable_auto_parallel_sbp_collector
oneflow.autograd
================================================
Functions and classes for autograd.
---------------------------------------------------
====================================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/autograd.html
``oneflow.autograd`` provides classes and functions implementing automatic differentiation of arbitrary scalar-valued
functions. It requires minimal changes to the existing code - you only need to declare ``Tensor`` s
for which gradients should be computed with the ``requires_grad=True`` keyword. As of now, we only support
autograd for floating point ``Tensor`` types (half, float, double and bfloat16).
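A minimal example of this workflow::

    import oneflow as flow

    x = flow.tensor([1.0, 2.0, 3.0], requires_grad=True)
    y = (x * x).sum()   # scalar-valued function of x
    y.backward()        # compute dy/dx with autograd
    print(x.grad)       # gradient is 2 * x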
.. currentmodule:: oneflow.autograd
.. autosummary::
:toctree: generated
:nosignatures:
backward
grad
Locally disabling gradient computation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
no_grad
enable_grad
set_grad_enabled
inference_mode
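For example, ``no_grad`` can be used as a context manager to disable gradient tracking locally (a short sketch)::

    import oneflow as flow

    x = flow.ones(2, 3, requires_grad=True)
    with flow.no_grad():
        y = x * 2
    print(y.requires_grad)  # False: y was created with gradient tracking disabled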
.. TODO(wyg): uncomment this after aligning accumulate grad
.. Default gradient layouts
.. ^^^^^^^^^^^^^^^^^^^^^^^^
.. A ``param.grad`` is accumulated by replacing ``.grad`` with a
.. new tensor ``.grad + new grad`` during :func:`oneflow.autograd.backward()` or
.. :func:`oneflow.Tensor.backward()`.
In-place operations on Tensors
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Supporting in-place operations in autograd is a hard matter, and we discourage
their use in most cases. Autograd's aggressive buffer freeing and reuse makes
it very efficient and there are very few occasions when in-place operations
actually lower memory usage by any significant amount. Unless you're operating
under heavy memory pressure, you might never need to use them.
Tensor autograd functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:nosignatures:
oneflow.Tensor.grad
oneflow.Tensor.requires_grad
oneflow.Tensor.is_leaf
oneflow.Tensor.backward
oneflow.Tensor.detach
oneflow.Tensor.register_hook
oneflow.Tensor.retain_grad
Function
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: Function
.. currentmodule:: oneflow.autograd
.. autoclass:: oneflow.autograd.Function
:members: apply,
:special-members: __call__,
.. autosummary::
:toctree: generated
:nosignatures:
Function.forward
Function.backward
Function.apply
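A minimal sketch of a custom ``Function``, assuming the PyTorch-style static ``forward``/``backward`` interface listed above::

    import oneflow as flow

    class Square(flow.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)      # stash the input for the backward pass
            return x * x

        @staticmethod
        def backward(ctx, grad_output):
            (x,) = ctx.saved_tensors
            return 2 * x * grad_output    # d(x^2)/dx = 2x

    x = flow.tensor([3.0], requires_grad=True)
    y = Square.apply(x)
    y.sum().backward()
    print(x.grad)                         # gradient is 2 * x = 6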
Context method mixins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When creating a new :class:`Function`, the following methods are available on ``ctx``.
.. currentmodule:: oneflow._oneflow_internal.autograd.Function
.. autosummary::
:toctree: generated
:nosignatures:
.. automodule:: oneflow.autograd
:members: grad,
backward,
FunctionCtx.mark_non_differentiable
FunctionCtx.save_for_backward
FunctionCtx.saved_tensors
oneflow.comm
===================================
OneFlow communication functions
----------------------------------
.. currentmodule:: oneflow.comm
.. automodule:: oneflow.comm
:members: all_reduce,
all_gather,
broadcast,
scatter,
all_to_all,
reduce,
gather,
reduce_scatter,
send,
recv,
barrier,
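A sketch of collective usage, assuming the script is launched with ``oneflow.distributed.launch`` on two ranks and that ``all_reduce`` reduces in place as in ``torch.distributed``::

    import oneflow as flow

    # each rank contributes rank + 1; after all_reduce every rank holds the sum
    x = flow.tensor([float(flow.env.get_rank() + 1)], device="cuda")
    flow.comm.all_reduce(x)
    print(f"rank {flow.env.get_rank()}: {x.numpy()}")  # both ranks print [3.]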
......@@ -45,9 +45,14 @@ extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"recommonmark",
"sphinx.ext.autosummary",
"sphinx_copybutton",
]
# build the templated autosummary files
autosummary_generate = True
numpydoc_show_class_members = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
......@@ -107,7 +112,6 @@ html_static_path = ["_static"]
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
......
oneflow.cuda
===================================
ONEFLOW.CUDA
----------------------------------
.. The documentation is referenced from: https://pytorch.org/docs/1.10/cuda.html.
.. currentmodule:: oneflow.cuda
.. automodule:: oneflow.cuda
:members: is_available,
device_count,
current_device,
set_device,
synchronize,
manual_seed_all,
manual_seed,
empty_cache,
HalfTensor,
FloatTensor,
DoubleTensor,
BoolTensor,
ByteTensor,
CharTensor,
IntTensor,
LongTensor,
\ No newline at end of file
.. autosummary::
:toctree: generated
:nosignatures:
is_available
device_count
current_device
set_device
synchronize
get_device_properties
get_device_capability
get_device_name
.. note::
The :attr:`current_device` returns the local rank as the device index, which is different from ``torch.cuda.current_device()`` in PyTorch.
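A short usage sketch of the device query functions above::

    import oneflow as flow

    if flow.cuda.is_available():
        print(flow.cuda.device_count())    # number of visible GPUs
        print(flow.cuda.current_device())  # local rank used as the device index
        flow.cuda.manual_seed_all(0)       # seed the random number generator on every device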
Random Number Generator
-------------------------
.. autosummary::
:toctree: generated
:nosignatures:
manual_seed_all
manual_seed
GPU tensor
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
HalfTensor
FloatTensor
DoubleTensor
BoolTensor
ByteTensor
CharTensor
IntTensor
LongTensor
Memory management
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
empty_cache
\ No newline at end of file
oneflow.distributed
=========================================================
.. note ::
Please refer to `OneFlow Distributed Overview <https://docs.oneflow.org/master/parallelism/01_introduction.html>`__
for a brief introduction to all features related to distributed training.
OneFlow provides two ways to accomplish `Distributed Training`:
- The recommended way is to use OneFlow's global Tensor for distributed training. A global Tensor treats the computing cluster as one supercomputing device, allowing users to write distributed training code just as they would in a single-machine environment.
- OneFlow also provides a DDP (DistributedDataParallel) module aligned with PyTorch. DDP is well known and widely used for data parallelism by the majority of PyTorch users. Also see `PyTorch DDP introduction <https://pytorch.org/docs/1.10/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_.
Basic
-------------------------------
When you start distributed training in OneFlow, the following functions can be used.
.. currentmodule:: oneflow.env
.. autosummary::
:toctree: generated
:nosignatures:
get_world_size
get_rank
get_local_rank
get_node_size
init_rdma
rdma_is_initialized
`Global Tensor`
--------------------------------------------------------------
Construct `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A `Global Tensor` can be created with a ``placement`` and an ``sbp``. The ``placement`` describes the physical devices on which the global tensor will be allocated, and the ``sbp`` describes how its data is distributed among these devices.
::
>>> import oneflow as flow
>>> # Place a global tensor on cuda device of rank(process) 0 and 1
>>> placement = flow.placement(type="cuda", ranks=[0, 1])
>>> # Each rank's local data is the part obtained by splitting the global data along dim 0
>>> sbp = flow.sbp.split(dim=0)
>>> # Create a global tensor by randn
>>> x = flow.randn(4, 5, placement=placement, sbp=sbp)
>>> x.shape
oneflow.Size([4, 5])
Convert `Local Tensor` to `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With the ``Tensor.to_global`` interface, a `Local Tensor` can create a `Global Tensor` that uses the `Local Tensor` as its local component at the current node.
Below, two `local tensors` with shape ``(2,5)`` are created separately on two devices. After the ``to_global`` call, a `global tensor` with shape ``(4,5)`` is obtained.
Code running on Node 0
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Code running on Node 1
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Redistribute `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Redistributing a `Global Tensor` means moving its data to another device group (or placement), or changing its data distribution (or SBP) across the group, or both at the same time. The redistributed tensor is still a `Global Tensor`.
::
>>> import oneflow as flow
>>> x = flow.tensor([1.0, 2.0], placement=flow.placement("cuda", ranks=[0, 1]), sbp=flow.sbp.split(0))
>>> y = x.to_global(placement=flow.placement("cuda", ranks=[2, 3]), sbp=flow.sbp.broadcast)
According to the operator's semantics, OneFlow defines a set of valid input and output SBP combinations for each built-in operator, so OneFlow can automatically redistribute the `Global Tensor` to satisfy the operator's SBP requirements for its input tensors. For example, the following code:
::
>>> import oneflow as flow
>>> x = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(0))
>>> y = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(1))
>>> z = x + y
When ``x + y`` is executed, since ``x`` is split along dimension ``0`` and ``y`` is split along dimension ``1``, their local components at each node cannot be added directly. OneFlow therefore automatically redistributes one of ``x`` and ``y`` so that they share the same SBP, and completes the add operation successfully.
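One possible explicit equivalent of that automatic redistribution is to convert one operand first (an illustrative sketch)::

    >>> y2 = y.to_global(placement=flow.placement("cuda", ranks=[0, 1]),
    ...                  sbp=flow.sbp.split(0))
    >>> z = x + y2  # both operands are now split along dim 0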
.. note ::
- Global Tensor cannot be used in combination with DDP currently.
- Global Tensor requires all devices to execute at the same pace; otherwise, it may cause a multi-process deadlock.
Get Local Tensor from Global Tensor
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With the ``Tensor.to_local`` interface, a `Global Tensor` can return its local component at the current node.
::
>>> y = x.to_local()
>>> y.is_local
True
>>> y
tensor([[ 2.9186e-01, -3.9442e-01, 4.7072e-04, -3.2216e-01, 1.7788e-01],
[-4.5284e-01, 1.2361e-01, -3.5962e-01, 2.6651e-01, 1.2951e+00]],
device='cuda:0', dtype=oneflow.float32)
DistributedDataParallel
--------------------------------------------------------------
For more information about DistributedDataParallel, see ``nn.parallel.DistributedDataParallel``.
The following script shows the process of using ``oneflow.nn.parallel.DistributedDataParallel`` for data-parallel training:
.. code-block::
import oneflow as flow
from oneflow.nn.parallel import DistributedDataParallel as ddp
train_x = [
flow.tensor([[1, 2], [2, 3]], dtype=flow.float32),
flow.tensor([[4, 6], [3, 1]], dtype=flow.float32),
]
train_y = [
flow.tensor([[8], [13]], dtype=flow.float32),
flow.tensor([[26], [9]], dtype=flow.float32),
]
class Model(flow.nn.Module):
def __init__(self):
super().__init__()
self.lr = 0.01
self.iter_count = 500
self.w = flow.nn.Parameter(flow.tensor([[0], [0]], dtype=flow.float32))
def forward(self, x):
x = flow.matmul(x, self.w)
return x
m = Model().to("cuda")
m = ddp(m)
loss = flow.nn.MSELoss(reduction="sum")
optimizer = flow.optim.SGD(m.parameters(), m.lr)
for i in range(0, m.iter_count):
rank = flow.env.get_rank()
x = train_x[rank].to("cuda")
y = train_y[rank].to("cuda")
y_pred = m(x)
l = loss(y_pred, y)
if (i + 1) % 50 == 0:
print(f"{i+1}/{m.iter_count} loss:{l}")
optimizer.zero_grad()
l.backward()
optimizer.step()
print(f"\nw:{m.w}")
There are only two differences between the data-parallel training code and the stand-alone single-device script:
- Use `DistributedDataParallel` to wrap the module object (`m = ddp(m)`)
- Use `get_rank` to get the current device number and distribute the data to the device.
Then use the `launcher` to run the script and leave everything else to OneFlow, which makes distributed training as simple as stand-alone single-device training:
::
python3 -m oneflow.distributed.launch --nproc_per_node 2 ./ddp_train.py
Communication collectives
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.comm
.. autosummary::
:toctree: generated
:nosignatures:
all_reduce
all_gather
all_gather_into_tensor
all_to_all
broadcast
barrier
gather
reduce
reduce_scatter
reduce_scatter_tensor
recv
scatter
send
Launching distributed training
--------------------------------------------------------------
.. currentmodule:: oneflow.distributed
Run the commands below to see more about its usage.
......
oneflow.distributions
==================================================
.. contents:: oneflow.distributions
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. currentmodule:: oneflow.distributions
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
Distribution
Categorical
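A small sketch, assuming ``Categorical`` follows the familiar ``probs``/``sample`` interface::

    import oneflow as flow

    probs = flow.tensor([0.1, 0.2, 0.7])
    dist = flow.distributions.Categorical(probs)
    print(dist.sample())  # index in {0, 1, 2}, drawn with the given probabilities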
oneflow.env
===================================
Environment
----------------------------------
.. currentmodule:: oneflow
.. autofunction:: oneflow.env.get_world_size
.. autofunction:: oneflow.env.get_rank
.. autofunction:: oneflow.env.get_local_rank
.. autofunction:: oneflow.env.get_node_size
.. autofunction:: oneflow.env.init_rdma
.. autofunction:: oneflow.env.rdma_is_initialized
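Typical usage in a multi-process job (a sketch)::

    import oneflow as flow

    print(flow.env.get_world_size())  # total number of processes
    print(flow.env.get_rank())        # global rank of this process
    print(flow.env.get_local_rank())  # rank within the current node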
Environment Variables
================================================
OneFlow has an extensive set of environment variables to tune for specific usage.
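Most of these variables are read when OneFlow initializes, so it is safest to set them before ``import oneflow``; for example (an illustrative sketch)::

    import os

    # ONEFLOW_DEBUG_MODE is enabled by any non-empty string (see its entry below)
    os.environ["ONEFLOW_DEBUG_MODE"] = "1"

    import oneflow as flow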
`ONEFLOW_COMM_NET_IB_HCA <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp#L47>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
When there are multiple IB NICs (which can be checked with ``ibstatus`` on the server), the system uses the first IB NIC for comm_net communication by default.
When this environment variable is set, the system will check all IB NICs and pick the NIC with the matching name. `#5626 <https://github.com/Oneflow-Inc/oneflow/pull/5626>`_
Values accepted
^^^^^^^^^^^^^^^
The default value is empty. Accepted values look like ``mlx5_0:1`` or ``mlx5_1:1``. When the port is 0, it defaults to 1, representing the first port.
`ONEFLOW_COMM_NET_IB_GID_INDEX <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_comm_network.cpp#L142>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Used for the `ibv_query_gid <https://www.ibm.com/docs/en/aix/7.2?topic=management-ibv-query-gid>`_ query, where 0 represents success. It is often used together with ``ONEFLOW_COMM_NET_IB_HCA``. GID means the Global ID; a QP on a RoCE network must be built with this value instead of just using the LID as on an IB network. `#5626 <https://github.com/Oneflow-Inc/oneflow/pull/5626>`_
Values accepted
^^^^^^^^^^^^^^^
The default value is ``0``, representing the port index.
`ONEFLOW_COMM_NET_IB_QUEUE_DEPTH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp#L44>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Queue length of jobs in the IB network.
This value lets OneFlow control the size itself instead of relying on IB's default, similar to ``ONEFLOW_COMM_NET_IB_MEM_BLOCK_SIZE``.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1024``, and an ``int64_t`` is accepted. The system compares the value with ``max_qp_wr`` (the maximum number of outstanding WRs on any work queue) and takes the smaller one.
`ONEFLOW_COMM_NET_IB_MEM_BLOCK_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/comm_network/ibverbs/ibverbs_qp.cpp#L68>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The size of each memory block read during communication.
The value determines how the data is divided into blocks, which are then encapsulated and transmitted.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``8388608`` (8M)
`ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/ep/cuda/cuda_device.cpp#L59>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Marks the CUDA events used by streams with the blocking-synchronization flag. `Detailed information <https://www.cnblogs.com/1024incn/p/5891051.html>`_, `#5612 <https://github.com/Oneflow-Inc/oneflow/pull/5612>`_, `#5837 <https://github.com/Oneflow-Inc/oneflow/pull/5837>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_LIBIBVERBS_PATH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/platform/lib/ibv_wrapper.cpp#L24>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Loads the dynamic library with dlopen at runtime so that symbols of ibverbs functions are found via dlopen instead of being linked at compile time, for better compatibility. `#4852 <https://github.com/Oneflow-Inc/oneflow/pull/4852>`_.
If loading fails, it outputs ``libibverbs not available, ibv_fork_init skipped``; if it succeeds, ``import oneflow`` outputs something like ``loaded library: /usr/lib/x86_64-linux-gnu/libibverbs.so.1``.
Values accepted
^^^^^^^^^^^^^^^
The default value is empty, in which case ``libibverbs.so.1`` and then ``libibverbs.so`` are loaded.
`ONEFLOW_DEBUG_MODE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/env_var/debug_mode.h#L23>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Enables ``debug`` mode; ``ONEFLOW_DEBUG`` has the same effect.
When ``debug`` mode is on, more INFO-level logs are produced, and additional ``prototxt`` and ``dot`` files are written. The automatically inserted boxing information is printed to the log file under eager global mode.
Values accepted
^^^^^^^^^^^^^^^
The default value is empty; any string is accepted.
`ONEFLOW_DRY_RUN <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/resource_desc.cpp#L65>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Only for dry runs; it can generate log files such as ``dot`` files.
The process exits once the dry run succeeds and does not attempt real training.
Values accepted
^^^^^^^^^^^^^^^
The default value is empty; any string is accepted.
`ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/lazy/stream_context/cuda/cuda_stream_context.cpp#L66>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Only used when debugging, because performance is affected; it can detect which op in the network produces nan or inf.
It creates a ``CpuCheckNumericsKernelObserver`` on ``cpu`` and a ``CudaCheckNumericsKernelObserver`` on ``cuda``. `#6052 <https://github.com/Oneflow-Inc/oneflow/pull/6052>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_DEBUG_KERNEL_SYNC_CHECK <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L193>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Only used when debugging, because performance is affected.
It creates a ``SyncCheckKernelObserver`` that synchronizes after each kernel.
It can be used to debug CUDA errors. `#6052 <https://github.com/Oneflow-Inc/oneflow/pull/6052>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_PROFILER_KERNEL_PROFILE_CUDA_MEMORY_BANDWIDTH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/kernel.cpp#L34>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Used when generating profiler files with nsys.
For now, the profiler is only valid in lazy mode.
It estimates the memory bandwidth reached by a kernel from the execution time of the GPU kernel and the size of its input and output memory, helping to find kernels that could be optimized. `Details <https://github.com/Oneflow-Inc/oneflow/blob/02e29f9648f63a4d936cd818061e90064d027005/oneflow/core/profiler/kernel.cpp#L53>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``. To use it, the package must be compiled with ``BUILD_PROFILER`` enabled.
`ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/kernel.cpp#L36>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The same as above; additionally collects the `op name <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/kernel.cpp#L62>`_.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``. To use it, the package must be compiled with ``BUILD_PROFILER`` enabled.
`ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L199>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The blob access checker exists only for correctness assurance; disabling it can reduce kernel overhead in some cases. `#5728 <https://github.com/Oneflow-Inc/oneflow/pull/5728>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/kernel/user_kernel.cpp#L692>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Takes effect when built with ``WITH_CUDA_GRAPHS``; the default value is ``false``.
Enabling CUDA Graphs support uses more memory, so it may not run when memory is already tight. `#5868 <https://github.com/Oneflow-Inc/oneflow/pull/5868>`_
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/thread/thread.cpp#L30>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
LightActor is a new type of Actor that only handles NormalForward and similar tasks where every regst_num is 1, or tasks with only one kernel. `#5868 <https://github.com/Oneflow-Inc/oneflow/pull/5868>`_. ``export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1`` (uses more memory), ``export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1``, ``export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1``, ``export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1``, ``export ONEFLOW_STREAM_REUSE_CUDA_EVENT=1`` can be used together.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/thread/thread.cpp#L29>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
`#5720 <https://github.com/Oneflow-Inc/oneflow/pull/5720>`_. Enables the local message queue; ``oneflow.config.thread_enable_local_message_queue(True)`` is no longer used.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``false``; it is treated as ``true`` only when the value is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_PERSISTENT_IN_STREAM_BUFFER_SIZE_BYTES <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/persistence/persistent_in_stream.cpp#L30>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Represents the size of each read from disk. `#5162 <https://github.com/Oneflow-Inc/oneflow/pull/5162>`_
Values accepted
^^^^^^^^^^^^^^^
The default value is empty. If an invalid string or a negative number is given, the value falls back to ``32 * 1024`` (32 KB).
`ONEFLOW_DECODER_ENABLE_NVJPEG_HARDWARE_ACCELERATION <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/kernel/image_decoder_random_crop_resize_kernel.cpp#L290>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
``NVJPEG_VER_MAJOR`` needs to be greater than ``11``. Enables nvJPEG hardware acceleration and warms up the jpeg and hw_jpeg decoders. `#5851 <https://github.com/Oneflow-Inc/oneflow/pull/5851>`_.
It uses the hardware JPEG decoder and the NVIDIA nvJPEG library on NVIDIA A100 GPUs.
Values accepted
^^^^^^^^^^^^^^^
Defaults to ``true``; an explicit value is treated as ``true`` only when it is ``1``, ``true``, ``yes``, ``on``, or ``y``.
`ONEFLOW_SERVING_DEBUG <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/api/cpp/framework/graph.cpp#L213>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Prints debug information for OneFlow Serving.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_DISABLE_VIEW <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/framework/tensor_methods.cpp#L35>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Disables the view mechanism, which means ops related to view will no longer run.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_BOXING_DISABLE_MIDDLE_NODE_AND_CHECK <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/auto_parallel/boxing_collector.cpp#L82>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable the middle node. When it is ``false``, all inter-SBP communication is supported.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_DISABLE_NUMA_AWARE_ALLOCATION <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/embedding/full_cache.cu#L414>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable NUMA-aware memory allocation when the OneEmbedding module allocates GPU memory.
NUMA-aware allocation means that when pinned host memory is allocated, the CPU close to the GPU is taken into account (for example, for GPUs 0 and 1, memory is allocated on CPU 0).
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_EP_CUDA_ENABLE_TF32_EXECUTION <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/ep/cuda/cuda_stream.cpp#L96>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to allow CUDA to use TF32 numeric types for computation
Values accepted
^^^^^^^^^^^^^^^
The default value is ``true``
`ONEFLOW_FUNCTOR_DISABLE_FUSED_MLP <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/functional/impl/nn_functor.cpp#L554>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable the fused_mlp operator implemented with cublasLt in FusedMLPFunctor; if disabled, it degenerates into multiple separate matrix multiplications.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_EMBEDDING_SHUFFLE_INDEPENTENT_STREAM <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp#L192>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to put the EmbeddingShuffle of the OneEmbedding module on a separate stream for overlapping execution.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_GRADIENT_SHUFFLE_USE_FP16 <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp#L209>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to allow the EmbeddingGradientShuffle operator of the OneEmbedding module to use the FP16 data type in the AMP case.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``true``
`ONEFLOW_ONE_EMBEDDING_NOT_FUSE_CAST_TO_UPDATE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job_rewriter/replace_embedding_ops_pass.cpp#L260>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to disable fusing the cast type conversion and the parameter update of OneEmbedding parameters into one operator under AMP.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_DEBUG_KERNEL_SYNC_CHECK_NUMERICS_DUMP <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/kernel/cpu_numerics_kernel_observer.cpp#L65>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
When NaN or Inf appears in the values, dump the data.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_ENABLE_IR_PRINTING <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/lib/OneFlow/Passes.cpp#L768>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Controls whether to print the IR when each pass runs during debugging.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_STDOUT <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/oneflow-extension/extension.cpp#L151>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control whether MLIR outputs log information in the console
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_DUMP_IR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/oneflow-extension/extension.cpp#L152>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control whether to dump ir files
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_MLIR_ENABLE_ROUND_TRIP <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/ir/oneflow-extension/ir_pass.cpp#L157>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Controls whether the OneFlow Job goes through MLIR.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_KERNEL_REDUCE_SUM_USE_MATMUL <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/user/kernels/reduce_kernel.cpp#L333>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to use matrix multiplication to implement reduce_sum.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_ONE_EMBEDDING_ENABLE_QUANTIZED_COMM <https://github.com/Oneflow-Inc/oneflow/blob/dd580f21ffb6e4d23a899c7e0ac6d2bc502f3f1a/oneflow/core/job_rewriter/fuse_embedding_interaction_pass.cpp#L35>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Whether to quantize the shuffle communication when OneEmbedding runs on multiple devices.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``false``
`ONEFLOW_TENSOR_BUFFER_ALIGNED_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L29>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Align size when allocating TensorBuffer memory
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1024``
`ONEFLOW_TENSOR_BUFFER_POOL_THREAD_LOCAL_CACHE_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L206>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the size of ``thread_local_cache`` in TensorBufferPool
Values accepted
^^^^^^^^^^^^^^^
The default value is ``64``
`ONEFLOW_GRPC_MAX_MESSAGE_BYTE_SIZE <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/control/ctrl_service.cpp#L45>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Set the maximum size of the gRPC transport message
Values accepted
^^^^^^^^^^^^^^^
The default value is ``-1``
`ONEFLOW_ONE_EMBEDDING_PERSISTENT_TABLE_CAPACITY_HINT <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/embedding/persistent_table.cpp#L410>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the initial capacity of the PersistentTable of OneEmbedding to avoid frequent expansion
Values accepted
^^^^^^^^^^^^^^^
By default, OneEmbedding estimates an appropriate capacity automatically; users can also configure a larger capacity explicitly.
`ONEFLOW_ONE_EMBEDDING_PERSISTENT_TABLE_NUM_WORKERS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/embedding/persistent_table.cpp#L435>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
The number of threads used for reading and writing the PersistentTable of OneEmbedding
Values accepted
^^^^^^^^^^^^^^^
The default value is ``4``
`ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/ep/cuda/cuda_device.cpp#L62>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Specify the element count of the all-zero and all-one constant buffers on the CUDA device.
These buffers can be combined with matrix multiplication to implement operations such as reduce_sum.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1024x1024``
`OMP_NUM_THREADS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L96>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Set the number of threads used by OpenMP
Values accepted
^^^^^^^^^^^^^^^
The default value will be generated by specific `computational logic <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/env_global_objects_scope.cpp#L106-L108>`_.
`SBP_INFER_RULE_TAG <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/operator/operator.cpp#L718>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Specify SBP derivation rules
Values accepted
^^^^^^^^^^^^^^^
When the value is ``1``, select the SBP that satisfies the producer, or the SBP with the smallest cost, whenever possible.
When the value is ``2``, select the SBP that matches best.
When the value is ``3``, select the SBP with the smallest cost.
`ONEFLOW_TENSOR_BUFFER_GROWTH_FACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L35>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the growth factor of TensorBuffer
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1.0``
`ONEFLOW_TENSOR_BUFFER_SHRINK_FACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L41>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the shrink factor of TensorBuffer
Values accepted
^^^^^^^^^^^^^^^
The default value is ``0.7``
`ONEFLOW_TENSOR_BUFFER_POOL_SIZE_FACTOR <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/common/tensor_buffer.cpp#L200>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the size factor of TensorBuffer
Values accepted
^^^^^^^^^^^^^^^
The default value is ``2.0``
`AUTO_PARALLEL_TRANSFER_COST <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/framework/sbp_infer_util.cpp#L544>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Control the transfer cost used in automatic parallelization
Values accepted
^^^^^^^^^^^^^^^
The default value is ``1.65e8``
`ONEFLOW_DEBUG_PASS <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/job/job_build_and_infer_ctx.cpp#L991>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Specify a pass name to print the job before and after that pass, e.g. ``export ONEFLOW_DEBUG_PASS="FuseAddToOutputPass"``.
Or specify ``ALL`` to print the job before and after every pass, e.g. ``export ONEFLOW_DEBUG_PASS="ALL"``.
Values accepted
^^^^^^^^^^^^^^^
The default value is ``empty``
`ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX <https://github.com/Oneflow-Inc/oneflow/blob/v0.9.0/oneflow/core/profiler/profiler.cpp#L39>`_
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Add a prefix to the names of host threads in the profiling context to facilitate sorting in the visualization tool (Nsight)
Values accepted
^^^^^^^^^^^^^^^
The default value is ``empty``
oneflow.nn.functional
===========================================
Functional operations for neural networks
-------------------------------------------
.. currentmodule:: oneflow.nn.functional
.. autofunction:: conv1d
.. autofunction:: conv2d
.. autofunction:: conv3d
.. autofunction:: conv_transpose1d
.. autofunction:: conv_transpose2d
.. autofunction:: conv_transpose3d
.. autofunction:: adaptive_avg_pool1d
.. autofunction:: adaptive_avg_pool2d
.. autofunction:: adaptive_avg_pool3d
.. autofunction:: relu
.. autofunction:: hardsigmoid
.. autofunction:: hardshrink
.. autofunction:: hardswish
.. autofunction:: hardtanh
.. autofunction:: normalize
.. autofunction:: layer_norm
.. autofunction:: leaky_relu
.. autofunction:: elu
.. autofunction:: celu
.. autofunction:: selu
.. autofunction:: sigmoid
.. autofunction:: pad
.. autofunction:: prelu
.. autofunction:: logsigmoid
.. autofunction:: log_softmax
.. autofunction:: gelu
.. autofunction:: glu
.. autofunction:: softsign
.. autofunction:: softmax
.. autofunction:: softplus
.. autofunction:: tanh
.. autofunction:: threshold
.. autofunction:: softshrink
.. autofunction:: silu
.. autofunction:: mish
.. autofunction:: one_hot
.. autofunction:: triplet_margin_loss
.. autofunction:: dropout
.. autofunction:: affine_grid
.. autofunction:: grid_sample
.. autofunction:: interpolate
.. autofunction:: ctc_greedy_decoder
.. autofunction:: sparse_softmax_cross_entropy
.. autofunction:: embedding
.. autofunction:: linear
.. autofunction:: cosine_similarity
.. autofunction:: cross_entropy
oneflow.nn.Graph
============================================================
Base class for running neural networks in Static Graph Mode.
Currently, there are two main ways to run models in deep learning frameworks: dynamic graphs and static graphs, which are conventionally referred to as :ref:`dynamic graph` and :ref:`static graph` mode in OneFlow.
Both approaches have their advantages and disadvantages, and OneFlow supports both, with Eager mode being the default.
Generally speaking, dynamic graphs are easier to use, while static graphs offer better performance. OneFlow provides the :class:`oneflow.nn.Graph` module to let users build static graphs and train models with Eager-like programming conventions.
.. contents:: oneflow.nn.Graph
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. _dynamic graph:
Eager Mode to Static Graph Mode
------------------------------------------------------------
.. currentmodule:: oneflow.nn
.. autoclass:: oneflow.nn.Graph
:members: __init__,
build,
__call__,
add_optimizer,
set_grad_scaler,
state_dict,
load_state_dict,
name,
debug,
__repr__,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.graph_config.GraphConfig
:members: enable_amp,
enable_zero,
allow_fuse_model_update_ops,
allow_fuse_add_to_output,
allow_fuse_cast_scale,
set_gradient_accumulation_steps,
enable_cudnn_conv_heuristic_search_algo,
enable_straighten_algorithm,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.block_config.BlockConfig
:members: stage_id,
set_stage,
activation_checkpointing,
:member-order: bysource
OneFlow runs in Eager mode by default.
OneFlow's nn.Graph is programmed in a style very similar to Eager mode, so small changes can deliver large performance gains.
The following script shows the process of building a neural network in eager mode using the interface under ``oneflow.nn``:
.. code-block::

    import oneflow as flow
    import oneflow.nn as nn

    class ModuleMyLinear(nn.Module):
        def __init__(self, in_features, out_features):
            super().__init__()
            self.weight = nn.Parameter(flow.randn(in_features, out_features))
            self.bias = nn.Parameter(flow.randn(out_features))

        def forward(self, input):
            return flow.matmul(input, self.weight) + self.bias

    linear_model = ModuleMyLinear(4, 3)
An eager ``nn.Module`` can be reused by ``nn.Graph``. The eager-mode script above can be switched to static graph mode by adding just a few lines of code, following these steps:
- Define your customized graph as a subclass of ``nn.Graph``
- At the beginning of ``__init__``, call ``super().__init__()`` to let OneFlow do the necessary initialization of the Graph
- Reuse the Eager-mode ``nn.Module`` object in ``__init__`` (``self.model = model``)
- Describe the computation in the ``build`` method
- Instantiate your graph, then call it.
.. code-block::

    class GraphMyLinear(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model

        def build(self, input):
            return self.model(input)

    graph_mylinear = GraphMyLinear()
    input = flow.randn(1, 4)
    out = graph_mylinear(input)
    print(out)
    # Output: tensor([[-0.3298, -3.7907, 0.1661]], dtype=oneflow.float32)
.. _static graph:
Static Graph Mode
------------------------------------------------------------
Constructing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Base class for training or evaluating a neural network in static graph mode.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__init__
build
add_optimizer
set_grad_scaler
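A minimal sketch of a training graph built from the ``linear_model`` defined earlier; the loss function and optimizer chosen here are only illustrative:

.. code-block::

    import oneflow as flow
    import oneflow.nn as nn

    class GraphMyLinearTrain(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model                  # reuse the eager nn.Module
            self.loss_fn = nn.MSELoss()
            optimizer = flow.optim.SGD(self.model.parameters(), lr=1e-3)
            self.add_optimizer(optimizer)              # register the optimizer with the graph

        def build(self, x, y):
            y_pred = self.model(x)
            loss = self.loss_fn(y_pred, y)
            loss.backward()                            # backward is called inside build for training graphs
            return loss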
Executing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Call an ``nn.Graph`` instance to run a customized graph.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__call__
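For example, with the ``GraphMyLinear`` class defined earlier, the first call compiles the graph and later calls reuse the compiled plan:

.. code-block::

    graph_mylinear = GraphMyLinear()
    x = flow.randn(1, 4)
    out = graph_mylinear(x)   # the first call triggers compilation; later calls run the static graph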
Config options on a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Optimization options of an ``nn.Graph``.
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_amp
enable_zero
allow_fuse_model_update_ops
allow_fuse_add_to_output
allow_fuse_cast_scale
set_gradient_accumulation_steps
enable_cudnn_conv_heuristic_search_algo
enable_straighten_algorithm
enable_compress_memory
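These options are set through the graph's ``config`` attribute inside ``__init__``; a short sketch building on the earlier example (the specific options enabled here are only illustrative):

.. code-block::

    class GraphMyLinearAMP(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model
            self.config.enable_amp(True)                 # run the graph with automatic mixed precision
            self.config.allow_fuse_add_to_output(True)   # allow fusing element-wise add into the producer op

        def build(self, input):
            return self.model(input)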
Config options on a GraphModule
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
GraphModule is the graph representation of an ``nn.Module`` in an ``nn.Graph``.
When an ``nn.Module`` is added into an ``nn.Graph``, it is wrapped into a ProxyModule, which holds a GraphModule inside it.
You can get and set the GraphModule to enable graph optimizations on that ``nn.Module``.
.. currentmodule:: oneflow.nn.graph.graph_block.GraphModule
.. autosummary::
:toctree: generated
:nosignatures:
set_stage
activation_checkpointing
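A minimal sketch of setting one of these options inside ``__init__``. This assumes the GraphModule of a wrapped module is obtained with ``.to(nn.graph.GraphModule)``; consult the ``set_stage`` and ``activation_checkpointing`` pages generated above for the exact accessor:

.. code-block::

    import oneflow as flow
    import oneflow.nn as nn

    class GraphWithCheckpointing(nn.Graph):
        def __init__(self):
            super().__init__()
            self.model = linear_model  # the eager module defined earlier
            # Assumption: .to(nn.graph.GraphModule) returns the GraphModule held by the ProxyModule.
            self.model.to(nn.graph.GraphModule).activation_checkpointing = True

        def build(self, input):
            return self.model(input)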
Save & Load a Model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
state_dict
load_state_dict
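A minimal sketch of saving and restoring graph states with ``oneflow.save`` / ``oneflow.load`` (the path below is illustrative):

.. code-block::

    graph = GraphMyLinear()

    flow.save(graph.state_dict(), "my_graph")        # persist the graph states
    graph.load_state_dict(flow.load("my_graph"))     # restore them into a compatible graph instance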
Debug a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
__repr__
debug
name
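For example, turning on debug output before the first call prints build information, and ``__repr__`` / ``name`` help identify the graph; a small sketch reusing ``GraphMyLinear``:

.. code-block::

    graph = GraphMyLinear()
    graph.debug(1)                  # print build information at verbosity level 1
    out = graph(flow.randn(1, 4))
    print(repr(graph))              # summary of the graph structure
    print(graph.name)               # auto-generated unique name of this graph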
oneflow.hub
===================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/hub.html
OneFlow Hub is a pre-trained model repository designed to facilitate research reproducibility.
Publishing models
-----------------
OneFlow Hub supports publishing pre-trained models (model definitions and pre-trained weights)
to a GitHub repository by adding a simple ``hubconf.py`` file;
``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a Python function
(for example, a pre-trained model you want to publish).
::
    def entrypoint_name(*args, **kwargs):
        # args & kwargs are optional, for models which take positional/keyword arguments.
        ...
How to implement an entrypoint?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is a code snippet that specifies an entrypoint for the ``resnet18`` model, expanding
the implementation in ``Oneflow-Inc/vision/hubconf.py``.
In most cases, importing the right function in ``hubconf.py`` is sufficient; we
use the expanded version here only as an example to show how it works.
You can see the full script in
`Oneflow-Inc/vision repo <https://github.com/Oneflow-Inc/vision/blob/master/hubconf.py>`_
::
    dependencies = ['oneflow']
    from flowvision.models.resnet import resnet18 as _resnet18

    # resnet18 is the name of entrypoint
    def resnet18(pretrained=False, **kwargs):
        """ # This docstring shows up in hub.help()
        Resnet18 model
        pretrained (bool): kwargs, load pretrained weights into the model
        """
        # Call the model, load pretrained weights
        model = _resnet18(pretrained=pretrained, **kwargs)
        return model
- The ``dependencies`` variable is a **list** of package names required to **load** the model. Note that this might
  be slightly different from the dependencies required to train the model.
- ``args`` and ``kwargs`` are passed along to the real callable function.
- The docstring of the function works as a help message. It explains what the model does and which
  positional/keyword arguments are allowed. It's highly recommended to add a few examples here.
- An entrypoint function can either return a model (``nn.Module``) or auxiliary tools that make the user workflow smoother, e.g. tokenizers.
- Callables prefixed with an underscore are considered helper functions and won't show up in :func:`oneflow.hub.list()`.
- Pretrained weights can either be stored locally in the GitHub repo or be loadable by
  :func:`oneflow.hub.load_state_dict_from_url()`. If they are smaller than 2GB, it's recommended to attach them to a `project release <https://help.github.com/en/articles/distributing-large-binaries>`_
  and use the URL from the release.
In the example above, ``flowvision.models.resnet.resnet18`` handles ``pretrained``; alternatively, you can put the following logic in the entrypoint definition.
::
    if pretrained:
        # For checkpoint saved in local github repo, e.g. <RELATIVE_PATH_TO_CHECKPOINT>=weights/save.pth
        dirname = os.path.dirname(__file__)
        checkpoint = os.path.join(dirname, <RELATIVE_PATH_TO_CHECKPOINT>)
        state_dict = oneflow.load(checkpoint)
        model.load_state_dict(state_dict)

        # For checkpoint saved elsewhere
        checkpoint = 'https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip'
        model.load_state_dict(oneflow.hub.load_state_dict_from_url(checkpoint, progress=False))
Important Notice
^^^^^^^^^^^^^^^^
- The published models should be at least in a branch/tag. It can't be a random commit.
Loading models from Hub
-----------------------
OneFlow Hub provides convenient APIs to explore all available models in the hub
through :func:`oneflow.hub.list()`, show docstrings and examples through
:func:`oneflow.hub.help()`, and load pre-trained models using
:func:`oneflow.hub.load()`.
.. automodule:: oneflow.hub
.. autofunction:: list
.. autofunction:: help
.. autofunction:: load
.. autofunction:: download_url_to_file
.. autofunction:: load_state_dict_from_url
Running a loaded model:
^^^^^^^^^^^^^^^^^^^^^^^
Note that ``*args`` and ``**kwargs`` in :func:`oneflow.hub.load()` are used to
**instantiate** a model. After you have loaded a model, how can you find out
what you can do with the model?
A suggested workflow is:
- ``dir(model)`` to see all available methods of the model.
- ``help(model.foo)`` to check what arguments ``model.foo`` takes to run.
To help users explore without referring to documentation back and forth, we strongly
recommend repo owners make function help messages clear and succinct. It's also helpful
to include a minimal working example.
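A short sketch of this workflow, using the ``Oneflow-Inc/vision`` repo and the ``resnet18`` entrypoint from the example above:

.. code-block::

    import oneflow.hub as hub

    model = hub.load('Oneflow-Inc/vision', 'resnet18', pretrained=True)
    print(dir(model))       # list the attributes and methods the model exposes
    help(model.forward)     # show the help message for a specific method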
Where are my downloaded models saved?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The locations are used in the following order:
- Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``
- ``$ONEFLOW_HOME/hub``, if environment variable ``ONEFLOW_HOME`` is set.
- ``$XDG_CACHE_HOME/oneflow/hub``, if environment variable ``XDG_CACHE_HOME`` is set.
- ``~/.cache/oneflow/hub``
.. autofunction:: get_dir
.. autofunction:: set_dir
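For example (the cache path below is hypothetical):

.. code-block::

    import oneflow.hub as hub

    hub.set_dir('/data/oneflow_hub')   # override the default hub cache location (hypothetical path)
    print(hub.get_dir())               # -> /data/oneflow_hub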
Caching logic
^^^^^^^^^^^^^
By default, we don't clean up files after loading them. Hub uses the cache by default if a model already exists in the
directory returned by :func:`~oneflow.hub.get_dir()`.
Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
the existing GitHub folder and downloaded weights and reinitialize a fresh download. This is useful
when updates are published to the same branch, so users can keep up with the latest release.
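For example, to bypass the cache entirely:

.. code-block::

    import oneflow.hub as hub

    # Re-download both the repo and the weights instead of reusing the cache
    model = hub.load('Oneflow-Inc/vision', 'resnet18', pretrained=True, force_reload=True)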
Known limitations:
^^^^^^^^^^^^^^^^^^
OneFlow Hub works by importing the package as if it were installed. There are some side effects
introduced by importing in Python. For example, you can see new items in the Python caches
``sys.modules`` and ``sys.path_importer_cache``, which is normal Python behavior.
This also means that you may have import errors when importing different models
from different repos, if the repos have the same sub-package names (typically, a
``model`` subpackage). A workaround for these kinds of import errors is to
remove the offending sub-package from the ``sys.modules`` dict; more details can
be found in `this github issue
<https://github.com/pytorch/hub/issues/243#issuecomment-942403391>`_.
A known limitation worth mentioning here: users **CANNOT** load two different branches of
the same repo in the **same Python process**. This is just like installing two packages with the
same name in Python, which is not supported. The cache might also give you surprises if you
actually try it. Of course, it's totally fine to load them in separate processes.
......@@ -3,9 +3,14 @@ oneflow.nn.image
Image operations for neural networks
--------------------------------------
.. currentmodule:: oneflow.nn.image
.. automodule:: oneflow.nn.image
:members: Resize,
batch_align,
decode,
flip,
.. autosummary::
:toctree: generated
:nosignatures:
Resize
batch_align
decode
flip
normalize