Commit 8f7de847 authored by yuguo960516yuguo's avatar yuguo960516yuguo
Browse files

dtk

parent f262efc9
Pipeline #248 failed with stages
in 0 seconds
# OneFlow

OneFlow is a deep learning framework designed to be **user-friendly, scalable and efficient**. With OneFlow, it is easy to:

- program a model with **PyTorch-like API**
- scale a model to n-dimensional-parallel/distributed execution with the **Global View API**
- accelerate/deploy a model with the **Static Graph Compiler**.
[![Simple CI](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml/badge.svg)](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml) [![Simple CI](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml/badge.svg)](https://github.com/Oneflow-Inc/oneflow/actions/workflows/simple.yml)
[![Nightly Docker Image](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml/badge.svg)](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml) [![Nightly Docker Image](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml/badge.svg)](https://github.com/Oneflow-Inc/docker-images/actions/workflows/oneflow-nightly.yml)
...@@ -12,8 +9,10 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -12,8 +9,10 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
## Latest News

- Version 0.8.0 is out!
  - [Full changelog](https://github.com/Oneflow-Inc/oneflow/releases/tag/v0.8.0)
## Publication ## Publication
...@@ -36,7 +35,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -36,7 +35,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
### System Requirements ### System Requirements
- Linux. As for now, there is no pre-built release for macOS, Windows. - Linux. As for now, there is no pre-built release for macOS, Windows.
- Python 3.7, 3.8, 3.9, 3.10 - Python 3.6, 3.7, 3.8, 3.9, 3.10
- (**Highly recommended**) Upgrade pip - (**Highly recommended**) Upgrade pip
``` ```
...@@ -54,7 +53,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -54,7 +53,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
- To install latest stable release of OneFlow with CUDA support: - To install latest stable release of OneFlow with CUDA support:
```bash ```bash
python3 -m pip install oneflow python3 -m pip install -f https://release.oneflow.info oneflow==0.7.0+cu102
``` ```
- To install nightly release of OneFlow with CUDA support: - To install nightly release of OneFlow with CUDA support:
...@@ -67,7 +66,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an ...@@ -67,7 +66,7 @@ OneFlow is a deep learning framework designed to be **user-friendly, scalable an
- Stable - Stable
```bash ```bash
python3 -m pip install --find-links https://release.oneflow.info oneflow==0.8.0+[PLATFORM] python3 -m pip install --find-links https://release.oneflow.info oneflow==0.7.0+[PLATFORM]
``` ```
- Nightly - Nightly
``` ```
......
# Monkey patch to not ship libjvm.so in pypi wheels
import sys
from auditwheel.main import main
from auditwheel.policy import _POLICIES as POLICIES
# libjvm is loaded dynamically; do not include it
for p in POLICIES:
p['lib_whitelist'].append('librccl.so.1')
p['lib_whitelist'].append('libhipblas.so.0')
p['lib_whitelist'].append('libhiprand.so.1')
p['lib_whitelist'].append('librocrand.so.1')
p['lib_whitelist'].append('libMIOpen.so.1')
p['lib_whitelist'].append('libgalaxyhip.so.4')
p['lib_whitelist'].append('librocm_smi64.so.2')
p['lib_whitelist'].append('librocsolver.so.0 ')
p['lib_whitelist'].append('librocblas.so.0')
if __name__ == "__main__":
sys.exit(main())
# Monkey patch to not ship libjvm.so in pypi wheels
import sys
from auditwheel.main import main
from auditwheel.policy import _POLICIES as POLICIES
# libjvm is loaded dynamically; do not include it
for p in POLICIES:
p['lib_whitelist'].append('librccl.so.1')
p['lib_whitelist'].append('libhipblas.so.0')
p['lib_whitelist'].append('libhiprand.so.1')
p['lib_whitelist'].append('librocrand.so.1')
p['lib_whitelist'].append('libMIOpen.so.1')
p['lib_whitelist'].append('libgalaxyhip.so.5')
p['lib_whitelist'].append('librocm_smi64.so.2')
p['lib_whitelist'].append('librocsolver.so.0 ')
p['lib_whitelist'].append('librocblas.so.0')
if __name__ == "__main__":
sys.exit(main())
...@@ -328,6 +328,17 @@ if(BUILD_PYTHON OR BUILD_CPP_API) ...@@ -328,6 +328,17 @@ if(BUILD_PYTHON OR BUILD_CPP_API)
endif() endif()
endif() endif()
if (BUILD_ROCM)
# AMD compiler fails to compile these three files with '-O1/2/3'.
# The value of `COMPILE_OPTIONS` target property is added after CMAKE_<LANG>_FLAGS_<CONFIG>,
# so '-O0' will override '-O1/2/3'.
set_source_files_properties(${PROJECT_SOURCE_DIR}/oneflow/user/kernels/median_with_indices_kernel.hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/radix_sort_top_k_kernel.hip.cpp
${PROJECT_SOURCE_DIR}/oneflow/user/kernels/arg_sort_kernel.hip.cpp
# ${PROJECT_SOURCE_DIR}/oneflow/core/ep/cuda/primitive/broadcast_elementwise_binary_math.hip.cpp
PROPERTIES COMPILE_OPTIONS "-O0")
endif()
if(BUILD_PYTHON) if(BUILD_PYTHON)
# py ext lib # py ext lib
......
This diff is collapsed.
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ #ifndef ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_
#define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ #define ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_
#include <stdint.h> #include <stdint.h>
#include "oneflow/core/common/data_type.h" #include "oneflow/core/common/data_type.h"
namespace oneflow { namespace oneflow {
namespace embedding { namespace embedding {
namespace { namespace {
// From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h // From https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
static const uint64_t PRIME64_1 = static const uint64_t PRIME64_1 =
0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111 0x9E3779B185EBCA87ULL; // 0b1001111000110111011110011011000110000101111010111100101010000111
static const uint64_t PRIME64_2 = static const uint64_t PRIME64_2 =
0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111 0xC2B2AE3D27D4EB4FULL; // 0b1100001010110010101011100011110100100111110101001110101101001111
static const uint64_t PRIME64_3 = static const uint64_t PRIME64_3 =
0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001 0x165667B19E3779F9ULL; // 0b0001011001010110011001111011000110011110001101110111100111111001
static const uint64_t PRIME64_4 = static const uint64_t PRIME64_4 =
0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011 0x85EBCA77C2B2AE63ULL; // 0b1000010111101011110010100111011111000010101100101010111001100011
static const uint64_t PRIME64_5 = static const uint64_t PRIME64_5 =
0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101 0x27D4EB2F165667C5ULL; // 0b0010011111010100111010110010111100010110010101100110011111000101
#define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) { OF_DEVICE_FUNC uint64_t XXH64_round(uint64_t acc, uint64_t input) {
acc += input * PRIME64_2; acc += input * PRIME64_2;
acc = XXH_rotl64(acc, 31); acc = XXH_rotl64(acc, 31);
acc *= PRIME64_1; acc *= PRIME64_1;
return acc; return acc;
} }
OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) { OF_DEVICE_FUNC uint64_t xxh64_uint64(uint64_t v, uint64_t seed) {
uint64_t acc = seed + PRIME64_5; uint64_t acc = seed + PRIME64_5;
acc += sizeof(uint64_t); acc += sizeof(uint64_t);
acc = acc ^ XXH64_round(0, v); acc = acc ^ XXH64_round(0, v);
acc = XXH_rotl64(acc, 27) * PRIME64_1; acc = XXH_rotl64(acc, 27) * PRIME64_1;
acc = acc + PRIME64_4; acc = acc + PRIME64_4;
acc ^= (acc >> 33); acc ^= (acc >> 33);
acc = acc * PRIME64_2; acc = acc * PRIME64_2;
acc = acc ^ (acc >> 29); acc = acc ^ (acc >> 29);
acc = acc * PRIME64_3; acc = acc * PRIME64_3;
acc = acc ^ (acc >> 32); acc = acc ^ (acc >> 32);
return acc; return acc;
} }
static const size_t kShardingHashSeed = 1; static const size_t kShardingHashSeed = 1;
static const size_t kLocalUniqueHashSeed = 2; static const size_t kLocalUniqueHashSeed = 2;
static const size_t kGlobalUniqueHashSeed = 3; static const size_t kGlobalUniqueHashSeed = 3;
static const size_t kFullCacheHashSeed = 4; static const size_t kFullCacheHashSeed = 4;
static const size_t kLruCacheHashSeed = 5; static const size_t kLruCacheHashSeed = 5;
} // namespace } // namespace
struct ShardingHash { struct ShardingHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kShardingHashSeed); }
OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); } OF_DEVICE_FUNC size_t operator()(uint32_t v) { return xxh64_uint64(v, kShardingHashSeed); }
OF_DEVICE_FUNC size_t operator()(int32_t v) { OF_DEVICE_FUNC size_t operator()(int32_t v) {
return xxh64_uint64(static_cast<uint32_t>(v), kShardingHashSeed); return xxh64_uint64(static_cast<uint32_t>(v), kShardingHashSeed);
} }
OF_DEVICE_FUNC size_t operator()(int64_t v) { OF_DEVICE_FUNC size_t operator()(int64_t v) {
return xxh64_uint64(static_cast<uint64_t>(v), kShardingHashSeed); return xxh64_uint64(static_cast<uint64_t>(v), kShardingHashSeed);
} }
}; };
struct LocalUniqueHash { struct LocalUniqueHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLocalUniqueHashSeed); }
}; };
struct GlobalUniqueHash { struct GlobalUniqueHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kGlobalUniqueHashSeed); }
}; };
struct FullCacheHash { struct FullCacheHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kFullCacheHashSeed); }
}; };
struct LruCacheHash { struct LruCacheHash {
OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); } OF_DEVICE_FUNC size_t operator()(uint64_t v) { return xxh64_uint64(v, kLruCacheHashSeed); }
}; };
} // namespace embedding } // namespace embedding
} // namespace oneflow } // namespace oneflow
#endif // ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_ #endif // ONEFLOW_CORE_EMBEDDING_HASH_FUNCTION_HIP_H_
\ No newline at end of file
This diff is collapsed.
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#include "oneflow/core/ep/rocm/cuda_device.h" #include "oneflow/core/ep/rocm/cuda_device.h"
#include "oneflow/core/ep/rocm/cuda_event.h" #include "oneflow/core/ep/rocm/cuda_event.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
#ifdef WITH_ROCM #ifdef WITH_ROCM
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hip/hip_fp16.h> #include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000 // #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h> // #include <cuda_bf16.h>
// #endif // #endif
namespace oneflow { namespace oneflow {
namespace ep { namespace ep {
namespace { namespace {
constexpr size_t kDefaultConstBufElementCount = 1024 * 1024; constexpr size_t kDefaultConstBufElementCount = 1024 * 1024;
template<typename T> template<typename T>
void CreateConstBuffer(void** buf, T value, size_t n) { void CreateConstBuffer(void** buf, T value, size_t n) {
OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T))); OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T)));
std::vector<T> host(n, value); std::vector<T> host(n, value);
OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault)); OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault));
} }
} // namespace } // namespace
CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager) CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager)
: device_index_(device_index), : device_index_(device_index),
event_flags_{}, event_flags_{},
properties_{}, properties_{},
device_manager_(device_manager), device_manager_(device_manager),
const_buf_elem_cnt_(0), const_buf_elem_cnt_(0),
const_zeros_buffer_(nullptr), const_zeros_buffer_(nullptr),
const_ones_buffer_fp32_(nullptr), const_ones_buffer_fp32_(nullptr),
const_ones_buffer_fp16_(nullptr), const_ones_buffer_fp16_(nullptr),
const_ones_buffer_bf16_(nullptr) { const_ones_buffer_bf16_(nullptr) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_)); OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_));
event_flags_ = hipEventDisableTiming; event_flags_ = hipEventDisableTiming;
if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) { if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) {
event_flags_ |= hipEventBlockingSync; event_flags_ |= hipEventBlockingSync;
} }
const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT", const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT",
kDefaultConstBufElementCount); kDefaultConstBufElementCount);
if (const_buf_elem_cnt_ > 0) { if (const_buf_elem_cnt_ > 0) {
CreateConstBuffer<float>(&const_zeros_buffer_, static_cast<float>(0), const_buf_elem_cnt_); CreateConstBuffer<float>(&const_zeros_buffer_, static_cast<float>(0), const_buf_elem_cnt_);
CreateConstBuffer<float>(&const_ones_buffer_fp32_, static_cast<float>(1.0), CreateConstBuffer<float>(&const_ones_buffer_fp32_, static_cast<float>(1.0),
const_buf_elem_cnt_); const_buf_elem_cnt_);
CreateConstBuffer<half>(&const_ones_buffer_fp16_, static_cast<half>(1.0), const_buf_elem_cnt_); CreateConstBuffer<half>(&const_ones_buffer_fp16_, static_cast<half>(1.0), const_buf_elem_cnt_);
// #if CUDA_VERSION >= 11000 // #if CUDA_VERSION >= 11000
// CreateConstBuffer<nv_bfloat16>(&const_ones_buffer_bf16_, static_cast<nv_bfloat16>(1.0), // CreateConstBuffer<nv_bfloat16>(&const_ones_buffer_bf16_, static_cast<nv_bfloat16>(1.0),
// const_buf_elem_cnt_); // const_buf_elem_cnt_);
// #endif // #endif
} }
} }
CudaDevice::~CudaDevice() { CudaDevice::~CudaDevice() {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
for (auto* event : events_) { delete event; } for (auto* event : events_) { delete event; }
OF_CUDA_CHECK(hipFree(const_zeros_buffer_)); OF_CUDA_CHECK(hipFree(const_zeros_buffer_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_)); OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_)); OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_)); OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_));
} }
void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); } void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); }
Stream* CudaDevice::CreateStream() { Stream* CudaDevice::CreateStream() {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
return new CudaStream(this); return new CudaStream(this);
} }
void CudaDevice::DestroyStream(Stream* stream) { void CudaDevice::DestroyStream(Stream* stream) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
delete stream; delete stream;
} }
void CudaDevice::CreateEvents(Event** events, size_t count) { void CudaDevice::CreateEvents(Event** events, size_t count) {
size_t copied = 0; size_t copied = 0;
{ {
std::lock_guard<std::mutex> lock(events_mutex_); std::lock_guard<std::mutex> lock(events_mutex_);
copied = std::min(count, events_.size()); copied = std::min(count, events_.size());
size_t offset = events_.size() - copied; size_t offset = events_.size() - copied;
std::copy(events_.begin() + offset, events_.end(), events); std::copy(events_.begin() + offset, events_.end(), events);
events_.resize(offset); events_.resize(offset);
} }
if (copied != count) { if (copied != count) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); } for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); }
} }
} }
void CudaDevice::DestroyEvents(Event** events, size_t count) { void CudaDevice::DestroyEvents(Event** events, size_t count) {
std::lock_guard<std::mutex> lock(events_mutex_); std::lock_guard<std::mutex> lock(events_mutex_);
events_.insert(events_.end(), events, events + count); events_.insert(events_.end(), events, events + count);
} }
Maybe<void> CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) { Maybe<void> CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
CHECK(!options.HasPinnedDevice()); CHECK(!options.HasPinnedDevice());
hipError_t err = hipMalloc(ptr, size); hipError_t err = hipMalloc(ptr, size);
if (err != hipSuccess) { if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err); return Error::RuntimeError() << hipGetErrorString(err);
} else { } else {
return Maybe<void>::Ok(); return Maybe<void>::Ok();
} }
} }
void CudaDevice::Free(const AllocationOptions& attr, void* ptr) { void CudaDevice::Free(const AllocationOptions& attr, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipFree(ptr)); OF_CUDA_CHECK(hipFree(ptr));
} }
Maybe<void> CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) { Maybe<void> CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size); hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size);
if (err != hipSuccess) { if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err); return Error::RuntimeError() << hipGetErrorString(err);
} else { } else {
return Maybe<void>::Ok(); return Maybe<void>::Ok();
} }
} }
void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) { void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_); CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipHostFree(ptr)); OF_CUDA_CHECK(hipHostFree(ptr));
} }
const hipDeviceProp_t& CudaDevice::properties() const { return properties_; } const hipDeviceProp_t& CudaDevice::properties() const { return properties_; }
const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const { const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const {
if (GetSizeOfDataType(data_type) * n if (GetSizeOfDataType(data_type) * n
<= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) { <= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) {
return const_zeros_buffer_; return const_zeros_buffer_;
} else { } else {
return nullptr; return nullptr;
} }
} }
const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const { const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const {
if (n <= const_buf_elem_cnt_) { if (n <= const_buf_elem_cnt_) {
if (data_type == DataType::kFloat) { if (data_type == DataType::kFloat) {
return const_ones_buffer_fp32_; return const_ones_buffer_fp32_;
} else if (data_type == DataType::kFloat16) { } else if (data_type == DataType::kFloat16) {
return const_ones_buffer_fp16_; return const_ones_buffer_fp16_;
} else if (data_type == DataType::kBFloat16) { } else if (data_type == DataType::kBFloat16) {
return const_ones_buffer_bf16_; return const_ones_buffer_bf16_;
} else { } else {
return nullptr; return nullptr;
} }
} else { } else {
return nullptr; return nullptr;
} }
} }
} // namespace ep } // namespace ep
} // namespace oneflow } // namespace oneflow
#endif // WITH_ROCM #endif // WITH_ROCM
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ #ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ #define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#include "oneflow/core/ep/include/device.h" #include "oneflow/core/ep/include/device.h"
#include "oneflow/core/common/data_type.h" #include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM #ifdef WITH_ROCM
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
namespace oneflow { namespace oneflow {
namespace ep { namespace ep {
class CudaDevice : public Device { class CudaDevice : public Device {
public: public:
OF_DISALLOW_COPY_AND_MOVE(CudaDevice); OF_DISALLOW_COPY_AND_MOVE(CudaDevice);
explicit CudaDevice(int device_index, DeviceManager* device_manager); explicit CudaDevice(int device_index, DeviceManager* device_manager);
~CudaDevice() override; ~CudaDevice() override;
void SetAsActiveDevice() override; void SetAsActiveDevice() override;
DeviceType device_type() const override { return DeviceType::kCUDA; } DeviceType device_type() const override { return DeviceType::kCUDA; }
size_t device_index() const override { return device_index_; } size_t device_index() const override { return device_index_; }
DeviceManager* device_manager() const override { return device_manager_; } DeviceManager* device_manager() const override { return device_manager_; }
Stream* CreateStream() override; Stream* CreateStream() override;
void DestroyStream(Stream* stream) override; void DestroyStream(Stream* stream) override;
void CreateEvents(Event** events, size_t count) override; void CreateEvents(Event** events, size_t count) override;
void DestroyEvents(Event** events, size_t count) override; void DestroyEvents(Event** events, size_t count) override;
Maybe<void> Alloc(const AllocationOptions& options, void** ptr, size_t size) override; Maybe<void> Alloc(const AllocationOptions& options, void** ptr, size_t size) override;
void Free(const AllocationOptions& options, void* ptr) override; void Free(const AllocationOptions& options, void* ptr) override;
Maybe<void> AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override; Maybe<void> AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override;
void FreePinned(const AllocationOptions& options, void* ptr) override; void FreePinned(const AllocationOptions& options, void* ptr) override;
const hipDeviceProp_t& properties() const; const hipDeviceProp_t& properties() const;
const void* GetConstZeros(DataType data_type, size_t n) const; const void* GetConstZeros(DataType data_type, size_t n) const;
const void* GetConstOnes(DataType data_type, size_t n) const; const void* GetConstOnes(DataType data_type, size_t n) const;
private: private:
int device_index_; int device_index_;
std::mutex events_mutex_; std::mutex events_mutex_;
std::vector<Event*> events_; std::vector<Event*> events_;
unsigned int event_flags_; unsigned int event_flags_;
hipDeviceProp_t properties_; hipDeviceProp_t properties_;
DeviceManager* device_manager_; DeviceManager* device_manager_;
int64_t const_buf_elem_cnt_; int64_t const_buf_elem_cnt_;
void* const_zeros_buffer_; void* const_zeros_buffer_;
void* const_ones_buffer_fp32_; void* const_ones_buffer_fp32_;
void* const_ones_buffer_fp16_; void* const_ones_buffer_fp16_;
void* const_ones_buffer_bf16_; void* const_ones_buffer_bf16_;
}; };
} // namespace ep } // namespace ep
} // namespace oneflow } // namespace oneflow
#endif // WITH_ROCM #endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_ #endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment