Commit a715222c authored by yuguo's avatar yuguo
Browse files

0.9.1-rocm

parent f262efc9
......@@ -45,7 +45,7 @@ file(
"${PROJECT_SOURCE_DIR}/oneflow/user/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/api/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/maybe/*.*"
"${PROJECT_SOURCE_DIR}/oneflow/extension/python/*.*")
"${PROJECT_SOURCE_DIR}/oneflow/extension/*.*")
foreach(oneflow_single_file ${oneflow_all_src})
# Verify whether this file is for other platforms
......@@ -80,6 +80,21 @@ foreach(oneflow_single_file ${oneflow_all_src})
if(BUILD_CUDA)
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
if(BUILD_ROCM)
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cu$")
get_filename_component(oneflow_single_file_hip_cpp_dir ${oneflow_single_file} DIRECTORY)
get_filename_component(oneflow_single_file_hip_cpp ${oneflow_single_file} NAME_WE)
add_custom_command(
OUTPUT "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
COMMAND ${CMAKE_COMMAND} -E copy "${oneflow_single_file}" "${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp"
DEPENDS "${oneflow_single_file}"
)
list(APPEND of_all_obj_cc ${oneflow_single_file_hip_cpp_dir}/${oneflow_single_file_hip_cpp}_hip.cpp)
endif()
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user)/.*\\.cuh$")
list(APPEND of_all_obj_cc ${oneflow_single_file})
endif()
endif()
set(group_this ON)
endif()
......@@ -96,8 +111,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
set(group_this ON)
endif()
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/extension/python/.*\\.(h|cpp)$")
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/extension/.*\\.(c|h|cpp)$")
list(APPEND of_pyext_obj_cc ${oneflow_single_file})
set(group_this ON)
endif()
......@@ -105,7 +119,7 @@ foreach(oneflow_single_file ${oneflow_all_src})
if("${oneflow_single_file}" MATCHES "^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*\\.cpp$")
if("${oneflow_single_file}" MATCHES
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe)/.*_test\\.cpp$")
"^${PROJECT_SOURCE_DIR}/oneflow/(core|user|maybe|thread)/.*_test\\.cpp$")
# test file
list(APPEND of_all_test_cc ${oneflow_single_file})
elseif(APPLE AND "${oneflow_single_file}" MATCHES
......@@ -136,6 +150,7 @@ add_custom_target(
of_format
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${CMAKE_CURRENT_SOURCE_DIR}/oneflow --fix
--exclude="oneflow/user/kernels/fmha_flash_attention"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_license_format.py -i
${ONEFLOW_PYTHON_DIR} --fix --exclude="oneflow/include" --exclude="oneflow/core"
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/ci/check/run_clang_format.py --source_dir
......@@ -254,20 +269,22 @@ if("${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6a9bbd9f20dcd700e28738788bb63a160c6c088c.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/32805e60c9de1f82887cd2af30d247dcabd2e1d3.zip"
OR "${LLVM_MONO_REPO_URL}" STREQUAL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "f2f17229cf21049663b8ef4f2b6b8062"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "6b7c6506d5922de9632c8ff012b2f945"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e0ea669a9f0872d35bffda5ec6c5ac6f"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "241a333828bba1efa35aff4c4fc2ce87"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "075fbfdf06cb3f02373ea44971af7b03"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2")
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "e412dc61159b5e929b0c94e44b11feb2"
OR "${LLVM_MONO_REPO_MD5}" STREQUAL "334997b4879aba15d9323a732356cf2a")
unset(LLVM_MONO_REPO_URL CACHE)
unset(LLVM_MONO_REPO_MD5 CACHE)
endif()
set(LLVM_MONO_REPO_URL
"https://github.com/llvm/llvm-project/archive/6d6268dcbf0f48e43f6f9fe46b3a28c29ba63c7d.zip"
set(LLVM_MONO_REPO_URL "https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-15.0.6.zip"
CACHE STRING "")
use_mirror(VARIABLE LLVM_MONO_REPO_URL URL ${LLVM_MONO_REPO_URL})
set(LLVM_MONO_REPO_MD5 "334997b4879aba15d9323a732356cf2a" CACHE STRING "")
set(LLVM_MONO_REPO_MD5 "1ccc00accc87a1a5d42a275d6e31cd8c" CACHE STRING "")
set(ONEFLOW_BUILD_ROOT_DIR "${PROJECT_BINARY_DIR}")
add_subdirectory(${PROJECT_SOURCE_DIR}/oneflow/ir)
if(WITH_MLIR)
......@@ -306,9 +323,9 @@ elseif(UNIX)
${oneflow_third_party_libs}
${EXTERNAL_TARGETS}
-Wl,--no-whole-archive
-Wl,--as-needed
-ldl
-lrt
-Wl,--version-script ${PROJECT_SOURCE_DIR}/version_script.lds)
-lrt)
if(BUILD_CUDA)
target_link_libraries(oneflow CUDA::cudart_static)
endif()
......@@ -317,6 +334,43 @@ elseif(WIN32)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /WHOLEARCHIVE:oneflow")
endif()
if (BUILD_ROCM)
# AMD compiler fails to compile these three files with '-O1/2/3'.
# The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
# so '-O0' will override '-O1/2/3'.
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel_hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel_hip.cpp
#${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math1_hip.cpp
PROPERTIES COMPILE_OPTIONS "-O0")
endif()
if(BUILD_CUDA)
string(JOIN "," CUDA_REAL_ARCHS ${CUDA_REAL_ARCHS_LIST})
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/core/hardware/cuda_device_descriptor.cpp
PROPERTIES COMPILE_FLAGS "-DCUDA_REAL_ARCHS=\"${CUDA_REAL_ARCHS}\"")
endif()
if(BUILD_CUDA AND WITH_CUTLASS)
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
add_definitions(-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1)
endif()
set_property(
SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND PROPERTY INCLUDE_DIRECTORIES
${CUTLASS_INSTALL_DIR}/examples/41_fused_multi_head_attention)
set_property(SOURCE ${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_glu_kernel.cu APPEND
PROPERTY INCLUDE_DIRECTORIES ${CUTLASS_INSTALL_DIR}/examples/45_dual_gemm)
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA")
set_property(
SOURCE
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/fused_multi_head_attention_inference_kernel.cu
APPEND
PROPERTY COMPILE_OPTIONS "--use_fast_math")
endif()
endif()
# oneflow api common
if(BUILD_PYTHON OR BUILD_CPP_API)
file(GLOB_RECURSE of_api_common_files ${PROJECT_SOURCE_DIR}/oneflow/api/common/*.h
......@@ -343,6 +397,8 @@ if(BUILD_PYTHON)
add_dependencies(of_pyext_obj oneflow)
pybind11_add_module(oneflow_internal ${PYBIND11_SRCS} ${of_pybind_obj_cc} ${PYBIND_REGISTRY_CC})
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cublas/lib")
set_property(TARGET oneflow_internal APPEND PROPERTY BUILD_RPATH "\$ORIGIN/../nvidia/cudnn/lib")
set_compile_options_to_oneflow_target(oneflow_internal)
set_property(TARGET oneflow_internal PROPERTY CXX_VISIBILITY_PRESET "default")
add_dependencies(oneflow_internal of_functional_obj of_functional_tensor_obj of_op_schema)
......@@ -419,6 +475,9 @@ if(BUILD_TESTING)
oneflow_add_test(oneflow_testexe SRCS ${of_all_test_cc} TEST_NAME oneflow_test)
target_link_libraries(oneflow_testexe ${of_libs} ${oneflow_third_party_libs} glog::glog
${oneflow_test_libs})
if(WITH_MLIR)
target_link_libraries(oneflow_testexe MLIROneFlowExtension)
endif()
endif()
if(BUILD_CPP_API)
......@@ -524,6 +583,10 @@ if(BUILD_CPP_API)
if(BUILD_CUDA)
checkdirandappendslash(DIR ${NCCL_LIBRARY_DIR} OUTPUT NCCL_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${NCCL_LIBRARY_DIR_APPENDED})
if(WITH_CUTLASS)
checkdirandappendslash(DIR ${CUTLASS_LIBRARY_DIR} OUTPUT CUTLASS_LIBRARY_DIR_APPENDED)
list(APPEND LIBONEFLOW_THIRD_PARTY_DIRS ${CUTLASS_LIBRARY_DIR_APPENDED})
endif()
endif()
install(
......@@ -555,6 +618,7 @@ if(BUILD_CPP_API)
llvm-PerfectShuffle
llvm-tblgen
mlir-tblgen
mlir-pdll
obj2yaml
oneflow_tblgen
yaml-bench
......
......@@ -38,7 +38,13 @@ set(ONEFLOW_OP_GROUPS
"TRIGONOMETRIC"
"UNARY"
"UPSAMPLE"
"ONE_EMBEDDING")
"ONE_EMBEDDING"
"LINEAR_ALGEBRA"
"SYSTEM")
if(WITH_MLIR)
list(APPEND ONEFLOW_OP_GROUPS "MLIR_JIT")
endif(WITH_MLIR)
foreach(OP_GROUP_NAME IN LISTS ONEFLOW_OP_GROUPS)
list(APPEND ONEFLOW_SCHEMA_TABLEGEN_FLAGS "-DGET_ONEFLOW_${OP_GROUP_NAME}_OP_DEFINITIONS")
endforeach()
......
......@@ -137,7 +137,8 @@ endif()
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${RE2_INCLUDE_DIR})
if(BUILD_CUDA)
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
# Always use third_party/cub for Clang CUDA in case of compatibility issues
if("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVIDIA" AND CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
if(CMAKE_CXX_STANDARD LESS 14)
add_definitions(-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)
add_definitions(-DCUB_IGNORE_DEPRECATED_CPP11)
......@@ -150,6 +151,7 @@ if(BUILD_CUDA)
list(APPEND oneflow_third_party_dependencies cub_copy_headers_to_destination)
endif()
include(nccl)
include(cutlass)
list(APPEND oneflow_third_party_libs ${NCCL_LIBRARIES})
list(APPEND oneflow_third_party_libs ${CUDNN_LIBRARIES})
......@@ -159,12 +161,19 @@ if(BUILD_CUDA)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUDNN_INCLUDE_DIRS} ${CUB_INCLUDE_DIR}
${NCCL_INCLUDE_DIR})
endif()
if(WITH_CUTLASS)
list(APPEND oneflow_third_party_dependencies cutlass)
list(APPEND oneflow_third_party_dependencies cutlass_copy_examples_to_destination)
list(APPEND oneflow_third_party_libs ${CUTLASS_LIBRARIES})
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${CUTLASS_INCLUDE_DIR})
endif()
endif()
if (BUILD_ROCM)
# Find rocm packages
find_package(hip)
find_package(hipfft)
find_package(hipblas)
find_package(hipcub)
find_package(hiprand)
......@@ -173,11 +182,31 @@ if (BUILD_ROCM)
find_package(rccl)
add_definitions(-DWITH_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
add_definitions(-D__HIPCC__)
if (BUILD_ROCM_GRAPHS)
add_definitions(-DWITH_ROCM_GRAPHS)
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ -D__HIPCC__")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-macro-redefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-inconsistent-missing-override")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-exceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-negative")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shift-count-overflow")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-duplicate-decl-specifier")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-implicit-int-float-conversion")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pass-failed")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-gpu-rdc")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs hip::hipfft)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
list(APPEND oneflow_third_party_libs roc::rocrand)
......@@ -185,17 +214,18 @@ if (BUILD_ROCM)
list(APPEND oneflow_third_party_libs MIOpen)
link_directories(${ROCM_PATH}/rccl/lib)
list(APPEND oneflow_third_party_libs rccl)
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${HIP_INCLUDE_DIRS}
list(APPEND ONEFLOW_THIRD_PARTY_INCLUDE_DIRS ${HIP_INCLUDE_DIRS}
${HIPFFT_INCLUDE_DIRS}
${HIPBLAS_INCLUDE_DIRS}
${HIPCUB_INCLUDE_DIRS}
"${ROCM_PATH}/hiprand/include"
"${ROCM_PATH}/rocrand/include"
"${ROCM_PATH}/roctracer/include"
${MIOPEN_INCLUDE_DIRS}
${RCCL_INCLUDE_DIRS})
message(STATUS "ONEFLOW_THIRD_PARTY_INCLUDE_DIRS: ${ONEFLOW_THIRD_PARTY_INCLUDE_DIRS}")
endif()
if(BUILD_RDMA)
if(UNIX)
include(CheckIncludeFiles)
......
include(ExternalProject)

# CUTLASS is known not to build when the host compiler is Clang, so default
# it off in that case; the user can still force it via the cache.
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  set(WITH_CUTLASS_INIT OFF)
else()
  set(WITH_CUTLASS_INIT ON)
endif()
set(WITH_CUTLASS ${WITH_CUTLASS_INIT} CACHE BOOL "")

if(WITH_CUTLASS)
  add_definitions(-DWITH_CUTLASS)
  find_package(Threads)

  # Install layout of the external CUTLASS build.
  set(CUTLASS_PROJECT cutlass)
  set(CUTLASS_INSTALL_DIR ${THIRD_PARTY_DIR}/cutlass)
  set(CUTLASS_INCLUDE_DIR ${CUTLASS_INSTALL_DIR}/include CACHE PATH "" FORCE)
  set(CUTLASS_LIBRARY_DIR ${CUTLASS_INSTALL_DIR}/lib CACHE PATH "" FORCE)
  set(CUTLASS_LIBRARIES ${CUTLASS_LIBRARY_DIR}/libcutlass.so)
  set(CUTLASS_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cutlass/src/cutlass/)

  # CUTLASS tensor-op kernels need SM 70 or newer; drop older archs.
  foreach(arch ${CUDA_REAL_ARCHS_LIST})
    if(arch GREATER_EQUAL 70)
      list(APPEND CUTLASS_REAL_ARCHS ${arch})
    endif()
  endforeach()

  if(THIRD_PARTY)
    ExternalProject_Add(
      ${CUTLASS_PROJECT}
      PREFIX cutlass
      URL ${CUTLASS_URL}
      URL_MD5 ${CUTLASS_MD5}
      UPDATE_COMMAND ""
      BUILD_BYPRODUCTS ${CUTLASS_LIBRARIES}
      CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
                 -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS}
                 -DCMAKE_CXX_FLAGS_DEBUG:STRING=${CMAKE_CXX_FLAGS_DEBUG}
                 -DCMAKE_CXX_FLAGS_RELEASE:STRING=${CMAKE_CXX_FLAGS_RELEASE}
      CMAKE_CACHE_ARGS
        -DCMAKE_CUDA_COMPILER:STRING=${CUDAToolkit_NVCC_EXECUTABLE}
        -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER}
        -DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER}
        -DCMAKE_INSTALL_PREFIX:PATH=${CUTLASS_INSTALL_DIR}
        -DCMAKE_INSTALL_LIBDIR:PATH=${CUTLASS_LIBRARY_DIR}
        -DCMAKE_INSTALL_MESSAGE:STRING=${CMAKE_INSTALL_MESSAGE}
        -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
        # Build only the conv2d fprop kernels OneFlow actually uses to keep
        # the external build small.
        -DCUTLASS_LIBRARY_OPERATIONS:STRING=conv2d
        -DCUTLASS_LIBRARY_KERNELS:STRING=simt_hfprop_*,tensorop_f16_*fprop,tensorop_h*fprop
        -DCUTLASS_ENABLE_EXAMPLES:BOOL=OFF
        -DCUTLASS_ENABLE_PROFILER:BOOL=OFF
        -DCUTLASS_ENABLE_LIBRARY:BOOL=ON
        -DCUTLASS_NVCC_ARCHS:STRING=${CUTLASS_REAL_ARCHS}
        -DCUTLASS_ENABLE_TESTS:BOOL=OFF
        -DCUTLASS_UNITY_BUILD_ENABLED:BOOL=ON
        -DCUTLASS_LIBRARY_DEBUG_POSTFIX:STRING=
        -DCUTLASS_NVCC_EMBED_PTX:BOOL=OFF)

    # Example headers (fused MHA, dual GEMM) are #include'd by OneFlow kernels
    # but not installed by CUTLASS itself, so copy them out of the source tree.
    add_custom_target(cutlass_copy_examples_to_destination DEPENDS cutlass)
    set(CUTLASS_SOURCE_EXAMPLES_DIR ${CUTLASS_SOURCE_DIR}/examples)
    set(CUTLASS_INSTALL_EXAMPLES_FILES
        "41_fused_multi_head_attention/iterators/make_residual_last.h"
        "41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h"
        "41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h"
        "41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h"
        "41_fused_multi_head_attention/mma_from_smem.h"
        "41_fused_multi_head_attention/epilogue_rescale_output.h"
        "41_fused_multi_head_attention/attention_scaling_coefs_updater.h"
        "41_fused_multi_head_attention/gemm_kernel_utils.h"
        "41_fused_multi_head_attention/fmha_grouped_problem_visitor.h"
        "41_fused_multi_head_attention/fmha_grouped.h"
        "41_fused_multi_head_attention/default_fmha_grouped.h"
        "41_fused_multi_head_attention/epilogue_pipelined.h"
        "41_fused_multi_head_attention/epilogue_thread_apply_logsumexp.h"
        "41_fused_multi_head_attention/kernel_forward.h"
        "41_fused_multi_head_attention/gemm/custom_mma_multistage.h"
        "41_fused_multi_head_attention/gemm/custom_mma_base.h"
        "41_fused_multi_head_attention/gemm/custom_mma.h"
        "41_fused_multi_head_attention/gemm/custom_mma_pipelined.h"
        "41_fused_multi_head_attention/find_default_mma.h"
        "41_fused_multi_head_attention/debug_utils.h"
        "45_dual_gemm/test_run.h"
        "45_dual_gemm/kernel/dual_gemm.h"
        "45_dual_gemm/device/dual_gemm.h"
        "45_dual_gemm/dual_gemm_run.h"
        "45_dual_gemm/thread/left_silu_and_mul.h"
        "45_dual_gemm/threadblock/dual_mma_multistage.h"
        "45_dual_gemm/threadblock/dual_epilogue.h"
        "45_dual_gemm/threadblock/dual_mma_base.h")
    foreach(filename ${CUTLASS_INSTALL_EXAMPLES_FILES})
      add_custom_command(
        TARGET cutlass_copy_examples_to_destination
        POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
                ${CUTLASS_SOURCE_EXAMPLES_DIR}/${filename}
                ${CUTLASS_INSTALL_DIR}/examples/${filename}
        VERBATIM)
    endforeach()
  endif()
endif()
......@@ -7,12 +7,9 @@ set(EIGEN_URL https://github.com/Oneflow-Inc/eigen-git-mirror/archive/refs/tags/
set(EIGEN_MD5 a23cb70e12d1bf9b09cb28af51bc26ae)
use_mirror(VARIABLE EIGEN_URL URL ${EIGEN_URL})
add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING)
if(BUILD_CUDA)
add_definitions(-DEIGEN_USE_GPU)
endif()
add_definitions(-DEIGEN_NO_MALLOC)
#add_definitions(-DEIGEN_NO_AUTOMATIC_RESIZING -DEIGEN_NO_MALLOC -DEIGEN_USE_GPU)
if(THIRD_PARTY)
......
......@@ -34,11 +34,36 @@ else()
set(NCCL_INCLUDE_DIR ${NCCL_INSTALL_DIR}/include)
set(NCCL_LIBRARY_DIR ${NCCL_INSTALL_DIR}/lib)
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.12.10-1.tar.gz)
# Versions 2.13 and above may cause deadlocks
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.15.1-1.tar.gz)
set(NCCL_MD5 37b787ff8934cd9374b4612f663c17fa)
else()
set(NCCL_URL https://github.com/NVIDIA/nccl/archive/refs/tags/v2.12.10-1.tar.gz)
set(NCCL_MD5 bdb91f80b78c99831f09ca8bb28a1032)
endif()
use_mirror(VARIABLE NCCL_URL URL ${NCCL_URL})
list(APPEND NCCL_LIBRARIES ${NCCL_LIBRARY_DIR}/${NCCL_LIBRARY_NAME})
set(NCCL_ARCHS_LIST ${CUDA_REAL_ARCHS_LIST})
# remove redundant archs, https://github.com/NVIDIA/nccl/blob/cb111f764a6d46370f24f75101d6b219bb2dda54/makefiles/common.mk#L28
if("70" IN_LIST NCCL_ARCHS_LIST AND "75" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "75")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "86" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "86")
endif()
if("80" IN_LIST NCCL_ARCHS_LIST AND "89" IN_LIST NCCL_ARCHS_LIST)
list(REMOVE_ITEM NCCL_ARCHS_LIST "89")
endif()
foreach(arch ${NCCL_ARCHS_LIST})
string(APPEND NCCL_GENCODE "-gencode=arch=compute_${arch},code=sm_${arch} ")
endforeach()
if(THIRD_PARTY)
include(ProcessorCount)
......@@ -47,11 +72,12 @@ else()
nccl
PREFIX nccl
URL ${NCCL_URL}
URL_MD5 bdb91f80b78c99831f09ca8bb28a1032
URL_MD5 ${NCCL_MD5}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND make -j${PROC_NUM} src.build CUDA_HOME=${CUDATOOLKIT_BIN_ROOT}
NVCC_GENCODE=${NCCL_GENCODE}
INSTALL_COMMAND make src.install PREFIX=${NCCL_INSTALL_DIR}
BUILD_BYPRODUCTS ${NCCL_LIBRARIES})
......
......@@ -14,3 +14,4 @@ dataclasses; python_version<"3.7"
cmakelang==0.6.13
pytest-xdist
rich
portalocker
Auto Parallelism
====================================================
As the scale of deep-learning models grows larger and larger, distributed training,
or parallelism, is needed. Data parallelism and model parallelism have been designed
to speed up training and to solve memory issues.
In oneflow, SBP signature enables users to configure parallelism policy easily.
However, users still need to specify the SBP property for each operator, or most of them.
Users might spend a couple of days digging into the detail of parallelism and get a
low throughput just because of a slight mistake in the configuration of SBP signature.
.. note::
It only works on :doc:`graph` mode.
Our strength
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To get rid of all those configurations for SBP signatures, we developed auto parallelism.
Still, configurations of placement are necessary and we have not supported auto placement
yet. If you read this paragraph before you rush into any SBP stuff, then congratulations,
you do not need to learn SBPs. You can start writing your code as you did under CPU mode.
Our auto parallelism would generate a fast strategy customized for your specific models,
the size of parameters, and the number of available GPUs.
How to use auto parallelism?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You just need to simply enable the configuration settings in the model
of :doc:`graph` .
Example::
import oneflow as flow
class SubclassGraph(flow.nn.Graph):
def __init__(self):
super().__init__() # MUST be called
# auto parallelism configuration
self.config.enable_auto_parallel(True)
# other configurations about auto parallelism
# ......
def build(self):
pass
.. warning::
If you enable auto parallelism, OneFlow will take care of the SBP configurations
of operators except for explicit ``to_global`` functions.
Configuration API for auto parallelism
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_auto_parallel
enable_auto_parallel_ignore_user_sbp_config
set_auto_parallel_computation_cost_ratio
set_auto_parallel_wait_time
enable_auto_parallel_trunk_algo
enable_auto_parallel_sbp_collector
oneflow.autograd
================================================
Functions and classes for autograd.
---------------------------------------------------
====================================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/autograd.html
``oneflow.autograd`` provides classes and functions implementing automatic differentiation of arbitrary scalar
valued functions. It requires minimal changes to the existing code - you only need to declare ``Tensor`` s
for which gradients should be computed with the ``requires_grad=True`` keyword. As of now, we only support
autograd for floating point ``Tensor`` types ( half, float, double and bfloat16).
.. currentmodule:: oneflow.autograd
.. autoclass:: oneflow.autograd.Function
:members: apply,
:special-members: __call__,
.. automodule:: oneflow.autograd
:members: grad,
backward,
.. autosummary::
:toctree: generated
:nosignatures:
backward
grad
Locally disabling gradient computation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
no_grad
enable_grad
set_grad_enabled
inference_mode
.. TODO(wyg): uncomment this after aligning accumulate grad
.. Default gradient layouts
.. ^^^^^^^^^^^^^^^^^^^^^^^^
.. A ``param.grad`` is accumulated by replacing ``.grad`` with a
.. new tensor ``.grad + new grad`` during :func:`oneflow.autograd.backward()` or
.. :func:`oneflow.Tensor.backward()`.
In-place operations on Tensors
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Supporting in-place operations in autograd is a hard matter, and we discourage
their use in most cases. Autograd's aggressive buffer freeing and reuse makes
it very efficient and there are very few occasions when in-place operations
actually lower memory usage by any significant amount. Unless you're operating
under heavy memory pressure, you might never need to use them.
Tensor autograd functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:nosignatures:
oneflow.Tensor.grad
oneflow.Tensor.requires_grad
oneflow.Tensor.is_leaf
oneflow.Tensor.backward
oneflow.Tensor.detach
oneflow.Tensor.register_hook
oneflow.Tensor.retain_grad
Function
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: Function
.. currentmodule:: oneflow.autograd
.. autosummary::
:toctree: generated
:nosignatures:
Function.forward
Function.backward
Function.apply
Context method mixins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When creating a new :class:`Function`, the following methods are available to `ctx`.
.. currentmodule:: oneflow._oneflow_internal.autograd.Function
.. autosummary::
:toctree: generated
:nosignatures:
FunctionCtx.mark_non_differentiable
FunctionCtx.save_for_backward
FunctionCtx.saved_tensors
oneflow.comm
===================================
oneflow communication function
----------------------------------
.. currentmodule:: oneflow.comm
.. automodule:: oneflow.comm
:members: all_reduce,
all_gather,
broadcast,
scatter,
all_to_all,
reduce,
gather,
reduce_scatter,
send,
recv,
barrier,
......@@ -45,9 +45,14 @@ extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"recommonmark",
"sphinx.ext.autosummary",
"sphinx_copybutton",
]
# build the templated autosummary files
autosummary_generate = True
numpydoc_show_class_members = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
......@@ -107,7 +112,6 @@ html_static_path = ["_static"]
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
......
oneflow.cuda
===================================
ONEFLOW.CUDA
----------------------------------
.. The documentation is referenced from: https://pytorch.org/docs/1.10/cuda.html.
.. currentmodule:: oneflow.cuda
.. automodule:: oneflow.cuda
:members: is_available,
device_count,
current_device,
set_device,
synchronize,
manual_seed_all,
manual_seed,
empty_cache,
HalfTensor,
FloatTensor,
DoubleTensor,
BoolTensor,
ByteTensor,
CharTensor,
IntTensor,
LongTensor,
\ No newline at end of file
.. autosummary::
:toctree: generated
:nosignatures:
is_available
device_count
current_device
set_device
synchronize
get_device_properties
get_device_capability
get_device_name
.. note::
The :attr:`current_device` returns local rank as device index. It is different from the 'torch.current_device()' in PyTorch.
Random Number Generator
-------------------------
.. autosummary::
:toctree: generated
:nosignatures:
manual_seed_all
manual_seed
GPU tensor
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
HalfTensor
FloatTensor
DoubleTensor
BoolTensor
ByteTensor
CharTensor
IntTensor
LongTensor
Memory management
-----------------------------
.. autosummary::
:toctree: generated
:nosignatures:
empty_cache
\ No newline at end of file
oneflow.distributed
=========================================================
.. note ::
Please refer to `OneFlow Distributed Overview <https://docs.oneflow.org/master/parallelism/01_introduction.html>`__
for a brief introduction to all features related to distributed training.
OneFlow provides two ways to accomplish `Distributed Training`:
- The first way is that users are recommended to use OneFlow's global Tensor for distributed training. Global Tensor regards the computing cluster as a supercomputing device, allowing users to write distributed training code just like in a single-machine environment.
- OneFlow also provides a DDP(DistributedDataParallel) module aligned with PyTorch. DDP has been well-known and widely used in data parallelism by the majority of PyTorch users. Also see `PyTorch DDP introduction <https://pytorch.org/docs/1.10/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`_.
Basic
-------------------------------
When you start distributed training in OneFlow, the following functions can be used.
.. currentmodule:: oneflow.env
.. autosummary::
:toctree: generated
:nosignatures:
get_world_size
get_rank
get_local_rank
get_node_size
init_rdma
rdma_is_initialized
`Global Tensor`
--------------------------------------------------------------
Construct `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A `Global Tensor` can be created with a ``placement`` and a ``sbp``. The ``placement`` describes the physical devices on which the global tensor will be allocated, and the ``sbp`` describes its distribution among these devices.
::
>>> import oneflow as flow
>>> # Place a global tensor on cuda device of rank(process) 0 and 1
>>> placement = flow.placement(type="cuda", ranks=[0, 1])
>>> # Each rank's local data is a part of the data, obtained by splitting the global data on dim 0
>>> sbp = flow.sbp.split(dim=0)
>>> # Create a global tensor by randn
>>> x = flow.randn(4, 5, placement=placement, sbp=sbp)
>>> x.shape
oneflow.Size([4, 5])
Convert `Local Tensor` to `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With ``Tensor.to_global`` interface, `Local Tensor` can create a `Global Tensor` and use that `Local Tensor` as its local component at the current node.
Two `local tensors` with the shape of ``(2,5)`` are created separately on two devices. While after the ``to_global`` method, the `global tensor` with a shape of ``(4,5)`` is obtained.
Code running on Node 0
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Code running on Node 1
::
import oneflow as flow
x = flow.randn(2,5)
placement = flow.placement("cuda", [0,1])
sbp = flow.sbp.split(0)
x_global = x.to_global(placement=placement, sbp=sbp)
x_global.shape
Redistribute `Global Tensor`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Redistributing a `Global Tensor` means moving its data to another device group (or placement), or changing its data distribution (or SBP) across the group, or both at the same time. The redistributed tensor is still a `Global Tensor`.
::
>>> import oneflow as flow
>>> x = flow.tensor([1.0, 2.0], placement=flow.placement("cuda", ranks=[0, 1]), sbp=flow.sbp.split(0))
>>> y = x.to_global(placement=flow.placement("cuda", ranks=[2, 3]), sbp=flow.sbp.broadcast)
According to the operator's semantics, OneFlow defines a sequence of valid input and output SBP combinations for each built-in operator. So OneFlow could automatically redistribute the `Global Tensor` to satisfy the operator's SBP requirements for its input Tensor. For example, the following code:
::
>>> import oneflow as flow
>>> x = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(0))
>>> y = flow.randn(4, 4,
placement=flow.placement("cuda", ranks=[0, 1]),
sbp=flow.sbp.split(1))
>>> z = x + y
When ``x + y`` is executed, since x is split along dimension ``0`` and y is split along dimension ``1``, their local components at each node can not be added directly, then OneFlow will automatically redistribute one of x and y to make them have the same SBP, and complete the add operation successfully.
.. note ::
- Global Tensor can not be used in combination with DDP currently.
- Global Tensor requires all devices to execute at the same pace, otherwise, it may cause multi-process deadlock.
Get Local Tensor from Global Tensor
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With ``Tensor.to_local`` interface, the `Global Tensor` can return its local component at the current node.
::
y = x.to_local()
y.is_local
True
y
tensor([[ 2.9186e-01, -3.9442e-01, 4.7072e-04, -3.2216e-01, 1.7788e-01],
[-4.5284e-01, 1.2361e-01, -3.5962e-01, 2.6651e-01, 1.2951e+00]],
device='cuda:0', dtype=oneflow.float32)
DistributedDataParallel
--------------------------------------------------------------
For more information about DistributedDataParallel, see ``nn.parallel.DistributedDataParallel``
The following script shows the process of using ``oneflow.nn.parallel.DistributedDataParallel`` for training data parallel:
.. code-block::
import oneflow as flow
from oneflow.nn.parallel import DistributedDataParallel as ddp
train_x = [
flow.tensor([[1, 2], [2, 3]], dtype=flow.float32),
flow.tensor([[4, 6], [3, 1]], dtype=flow.float32),
]
train_y = [
flow.tensor([[8], [13]], dtype=flow.float32),
flow.tensor([[26], [9]], dtype=flow.float32),
]
class Model(flow.nn.Module):
def __init__(self):
super().__init__()
self.lr = 0.01
self.iter_count = 500
self.w = flow.nn.Parameter(flow.tensor([[0], [0]], dtype=flow.float32))
def forward(self, x):
x = flow.matmul(x, self.w)
return x
m = Model().to("cuda")
m = ddp(m)
loss = flow.nn.MSELoss(reduction="sum")
optimizer = flow.optim.SGD(m.parameters(), m.lr)
for i in range(0, m.iter_count):
rank = flow.env.get_rank()
x = train_x[rank].to("cuda")
y = train_y[rank].to("cuda")
y_pred = m(x)
l = loss(y_pred, y)
if (i + 1) % 50 == 0:
print(f"{i+1}/{m.iter_count} loss:{l}")
optimizer.zero_grad()
l.backward()
optimizer.step()
print(f"\nw:{m.w}")
There are only two differences between the data parallelism training code and the stand-alone single-card script:
- Use `DistributedDataParallel` to wrap the module object (`m = ddp(m)`)
- Use `get_rank` to get the rank of the current process and distribute the corresponding data to each device.
Then use `launcher` to run the script, leave everything else to OneFlow, which makes distributed training as simple as stand-alone single-card training:
::
python3 -m oneflow.distributed.launch --nproc_per_node 2 ./ddp_train.py
Communication collectives
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.comm
.. autosummary::
:toctree: generated
:nosignatures:
all_reduce
all_gather
all_gather_into_tensor
all_to_all
broadcast
barrier
gather
reduce
reduce_scatter
reduce_scatter_tensor
recv
scatter
send
Launching distributed training
--------------------------------------------------------------
.. currentmodule:: oneflow.distributed
Run the commands below to see more about its usage.
......
oneflow.distributions
==================================================
.. contents:: oneflow.distributions
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. currentmodule:: oneflow.distributions
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
Distribution
Categorical
oneflow.env
===================================
Environment
----------------------------------
.. currentmodule:: oneflow
.. autofunction:: oneflow.env.get_world_size
.. autofunction:: oneflow.env.get_rank
.. autofunction:: oneflow.env.get_local_rank
.. autofunction:: oneflow.env.get_node_size
.. autofunction:: oneflow.env.init_rdma
.. autofunction:: oneflow.env.rdma_is_initialized
This diff is collapsed.
oneflow.nn.functional
===========================================
Functional operations for neural networks
-------------------------------------------
.. currentmodule:: oneflow.nn.functional
.. autofunction:: conv1d
.. autofunction:: conv2d
.. autofunction:: conv3d
.. autofunction:: conv_transpose1d
.. autofunction:: conv_transpose2d
.. autofunction:: conv_transpose3d
.. autofunction:: adaptive_avg_pool1d
.. autofunction:: adaptive_avg_pool2d
.. autofunction:: adaptive_avg_pool3d
.. autofunction:: relu
.. autofunction:: hardsigmoid
.. autofunction:: hardshrink
.. autofunction:: hardswish
.. autofunction:: hardtanh
.. autofunction:: normalize
.. autofunction:: layer_norm
.. autofunction:: leaky_relu
.. autofunction:: elu
.. autofunction:: celu
.. autofunction:: selu
.. autofunction:: sigmoid
.. autofunction:: pad
.. autofunction:: prelu
.. autofunction:: logsigmoid
.. autofunction:: log_softmax
.. autofunction:: gelu
.. autofunction:: glu
.. autofunction:: softsign
.. autofunction:: softmax
.. autofunction:: softplus
.. autofunction:: tanh
.. autofunction:: threshold
.. autofunction:: softshrink
.. autofunction:: silu
.. autofunction:: mish
.. autofunction:: one_hot
.. autofunction:: triplet_margin_loss
.. autofunction:: dropout
.. autofunction:: affine_grid
.. autofunction:: grid_sample
.. autofunction:: interpolate
.. autofunction:: ctc_greedy_decoder
.. autofunction:: sparse_softmax_cross_entropy
.. autofunction:: embedding
.. autofunction:: linear
.. autofunction:: cosine_similarity
.. autofunction:: cross_entropy
oneflow.nn.Graph
============================================================
Base class for running neural networks in Static Graph Mode.
Currently, there are two main ways to run models in deep learning frameworks, namely dynamic graphs and static graphs, which are also conventionally referred to as :ref:`dynamic graph` and :ref:`static graph` in OneFlow.
Both approaches have their advantages and disadvantages, and OneFlow provides support for both approaches, with Eager mode being the default.
Generally speaking, dynamic graphs are easier to use and static graphs have more performance advantages. :class:`oneflow.nn.Graph` module is provided by OneFlow to allow users to build static graphs and train models with Eager-like programming conventions.
.. contents:: oneflow.nn.Graph
:depth: 2
:local:
:class: this-will-duplicate-information-and-it-is-still-useful-here
:backlinks: top
.. _dynamic graph:
Eager Mode to Static Graph Mode
------------------------------------------------------------
OneFlow runs in Eager mode by default.
OneFlow's nn.Graph is programmed in a style very similar to Eager Mode, so it is possible to make small changes and get large performance gains.
The following script shows the process of building a neural network in eager mode using the interface under ``oneflow.nn`` :
.. code-block::
import oneflow as flow
import oneflow.nn as nn
class ModuleMyLinear(nn.Module):
def __init__(self, in_features, out_features):
super().__init__()
self.weight = nn.Parameter(flow.randn(in_features, out_features))
self.bias = nn.Parameter(flow.randn(out_features))
def forward(self, input):
return flow.matmul(input, self.weight) + self.bias
linear_model = ModuleMyLinear(4, 3)
Eager ``nn.Module`` can be reused by ``nn.Graph``. The above script for eager mode can be changed to static Graph mode by adding just a few lines of code, which consists of the following steps:
- Define your customized graph as a subclass of ``nn.Graph``
- At the beginning of __init__, call super().__init__() to let OneFlow do the necessary initialization of the Graph
- Reuse the ``nn.Module`` object in Eager mode in __init__ (self.model = model)
- Describe the computation in the ``build`` method
- Instantiate your graph then call it.
.. code-block::
class GraphMyLinear(nn.Graph):
def __init__(self):
super().__init__()
self.model = linear_model
def build(self, input):
return self.model(input)
graph_mylinear = GraphMyLinear()
input = flow.randn(1, 4)
out = graph_mylinear(input)
print(out)
tensor([[-0.3298, -3.7907, 0.1661]], dtype=oneflow.float32)
.. _static graph:
Static Graph Mode
------------------------------------------------------------
.. currentmodule:: oneflow.nn
.. autoclass:: oneflow.nn.Graph
:members: __init__,
build,
__call__,
add_optimizer,
set_grad_scaler,
state_dict,
load_state_dict,
name,
debug,
__repr__,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.graph_config.GraphConfig
:members: enable_amp,
enable_zero,
allow_fuse_model_update_ops,
allow_fuse_add_to_output,
allow_fuse_cast_scale,
set_gradient_accumulation_steps,
enable_cudnn_conv_heuristic_search_algo,
enable_straighten_algorithm,
:member-order: bysource
.. autoclass:: oneflow.nn.graph.block_config.BlockConfig
:members: stage_id,
set_stage,
activation_checkpointing,
:member-order: bysource
Constructing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Base class for training or evaluating a neural network in static graph mode.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__init__
build
add_optimizer
set_grad_scaler
Executing a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Call a nn.Graph instance to run a customized graph.
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
__call__
Config options on a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Optimization options of a nn.Graph.
.. currentmodule:: oneflow.nn.graph.graph_config.GraphConfig
.. autosummary::
:toctree: generated
:nosignatures:
enable_amp
enable_zero
allow_fuse_model_update_ops
allow_fuse_add_to_output
allow_fuse_cast_scale
set_gradient_accumulation_steps
enable_cudnn_conv_heuristic_search_algo
enable_straighten_algorithm
enable_compress_memory
Config options on a GraphModule
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
GraphModule is the graph representation of a nn.Module in a nn.Graph.
When an nn.Module is added into an nn.Graph, it is wrapped into a ProxyModule. The ProxyModule has a GraphModule inside it.
You can get and set the GraphModule to enable graph optimization on the nn.Module.
.. currentmodule:: oneflow.nn.graph.graph_block.GraphModule
.. autosummary::
:toctree: generated
:nosignatures:
set_stage
activation_checkpointing
Save & Load a Model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. currentmodule:: oneflow.nn.Graph
.. autosummary::
:toctree: generated
:nosignatures:
state_dict
load_state_dict
Debug a Graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autosummary::
:toctree: generated
:nosignatures:
__repr__
debug
name
oneflow.hub
===================================
.. The documentation is referenced from:
https://pytorch.org/docs/1.10/hub.html
Oneflow Hub is a pre-trained model repository designed to facilitate research reproducibility.
Publishing models
-----------------
Oneflow Hub supports publishing pre-trained models(model definitions and pre-trained weights)
to a github repository by adding a simple ``hubconf.py`` file;
``hubconf.py`` can have multiple entrypoints. Each entrypoint is defined as a python function
(example: a pre-trained model you want to publish).
::
def entrypoint_name(*args, **kwargs):
# args & kwargs are optional, for models which take positional/keyword arguments.
...
How to implement an entrypoint?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is a code snippet that specifies an entrypoint for the ``resnet18`` model, expanding
the implementation in ``Oneflow-Inc/vision/hubconf.py``.
In most cases, importing the right function in ``hubconf.py`` is sufficient. Here we
just use the expanded version as an example to show how it works.
You can see the full script in
`Oneflow-Inc/vision repo <https://github.com/Oneflow-Inc/vision/blob/master/hubconf.py>`_
::
dependencies = ['oneflow']
from flowvision.models.resnet import resnet18 as _resnet18
# resnet18 is the name of entrypoint
def resnet18(pretrained=False, **kwargs):
""" # This docstring shows up in hub.help()
Resnet18 model
pretrained (bool): kwargs, load pretrained weights into the model
"""
# Call the model, load pretrained weights
model = _resnet18(pretrained=pretrained, **kwargs)
return model
- ``dependencies`` variable is a **list** of package names required to **load** the model. Note this might
be slightly different from dependencies required for training a model.
- ``args`` and ``kwargs`` are passed along to the real callable function.
- The docstring of the function works as a help message. It explains what the model does and what
  positional/keyword arguments are allowed. It's highly recommended to add a few examples here.
- An entrypoint function can either return a model (nn.Module), or auxiliary tools to make the user workflow smoother, e.g. tokenizers.
- Callables prefixed with underscore are considered as helper functions which won't show up in :func:`oneflow.hub.list()`.
- Pretrained weights can either be stored locally in the github repo, or loadable by
:func:`oneflow.hub.load_state_dict_from_url()`. If less than 2GB, it's recommended to attach it to a `project release <https://help.github.com/en/articles/distributing-large-binaries>`_
and use the url from the release.
In the example above ``flowvision.models.resnet.resnet18`` handles ``pretrained``, alternatively you can put the following logic in the entrypoint definition.
::
if pretrained:
# For checkpoint saved in local github repo, e.g. <RELATIVE_PATH_TO_CHECKPOINT>=weights/save.pth
dirname = os.path.dirname(__file__)
checkpoint = os.path.join(dirname, <RELATIVE_PATH_TO_CHECKPOINT>)
state_dict = oneflow.load(checkpoint)
model.load_state_dict(state_dict)
# For checkpoint saved elsewhere
checkpoint = 'https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip'
model.load_state_dict(oneflow.hub.load_state_dict_from_url(checkpoint, progress=False))
Important Notice
^^^^^^^^^^^^^^^^
- The published models should be at least in a branch/tag. It can't be a random commit.
Loading models from Hub
-----------------------
OneFlow Hub provides convenient APIs to explore all available models in hub
through :func:`oneflow.hub.list()`, show docstring and examples through
:func:`oneflow.hub.help()` and load the pre-trained models using
:func:`oneflow.hub.load()`.
.. automodule:: oneflow.hub
.. autofunction:: list
.. autofunction:: help
.. autofunction:: load
.. autofunction:: download_url_to_file
.. autofunction:: load_state_dict_from_url
Running a loaded model:
^^^^^^^^^^^^^^^^^^^^^^^
Note that ``*args`` and ``**kwargs`` in :func:`oneflow.hub.load()` are used to
**instantiate** a model. After you have loaded a model, how can you find out
what you can do with the model?
A suggested workflow is
- ``dir(model)`` to see all available methods of the model.
- ``help(model.foo)`` to check what arguments ``model.foo`` takes to run
To help users explore without referring to documentation back and forth, we strongly
recommend repo owners make function help messages clear and succinct. It's also helpful
to include a minimal working example.
Where are my downloaded models saved?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The locations are used in the order of
- Calling ``hub.set_dir(<PATH_TO_HUB_DIR>)``
- ``$ONEFLOW_HOME/hub``, if environment variable ``ONEFLOW_HOME`` is set.
- ``$XDG_CACHE_HOME/oneflow/hub``, if environment variable ``XDG_CACHE_HOME`` is set.
- ``~/.cache/oneflow/hub``
.. autofunction:: get_dir
.. autofunction:: set_dir
Caching logic
^^^^^^^^^^^^^
By default, we don't clean up files after loading them. Hub uses the cached copy by default if one already exists in the
directory returned by :func:`~oneflow.hub.get_dir()`.
Users can force a reload by calling ``hub.load(..., force_reload=True)``. This will delete
the existing github folder and downloaded weights, reinitialize a fresh download. This is useful
when updates are published to the same branch, users can keep up with the latest release.
Known limitations:
^^^^^^^^^^^^^^^^^^
OneFlow Hub works by importing the package as if it were installed. There are some side effects
introduced by importing in Python. For example, you can see new items in Python caches
``sys.modules`` and ``sys.path_importer_cache`` which is normal Python behavior.
This also means that you may have import errors when importing different models
from different repos, if the repos have the same sub-package names (typically, a
``model`` subpackage). A workaround for these kinds of import errors is to
remove the offending sub-package from the ``sys.modules`` dict; more details can
be found in `this github issue
<https://github.com/pytorch/hub/issues/243#issuecomment-942403391>`_.
A known limitation that is worth mentioning here: users **CANNOT** load two different branches of
the same repo in the **same python process**. It's just like installing two packages with the
same name in Python, which is not good. Cache might join the party and give you surprises if you
actually try that. Of course it's totally fine to load them in separate processes.
......@@ -3,9 +3,14 @@ oneflow.nn.image
Image operations for neural networks
--------------------------------------
.. currentmodule:: oneflow.nn.image
.. automodule:: oneflow.nn.image
:members: Resize,
batch_align,
decode,
flip,
normalize
.. autosummary::
:toctree: generated
:nosignatures:
Resize
batch_align
decode
flip
normalize
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment