trival files migration

460b49f4 · yuguo · 5e07668d · 460b49f4 · 460b49f4 · 460b49f4
Commit 460b49f4 authored Nov 02, 2022 by yuguo
18 changed files
--- a/oneflow/api/cpp/tests/graph_test.cpp
+++ b/oneflow/api/cpp/tests/graph_test.cpp
@@ -84,6 +84,26 @@ TEST(Api, graph_multi_gpu_test) {
 }
 #endif

+#ifdef WITH_ROCM
+TEST(Api, graph_gpu_test) {
+  EnvScope scope;
+  Device device("cuda", 0);
+  Graph graph = LoadGraph(device);
+  Forward(graph, device);
+}
+
+TEST(Api, graph_multi_gpu_test) {
+  EnvScope scope;
+  Device device("cuda", 0);
+  Graph graph = LoadGraph(device);
+  Forward(graph, device);
+
+  Device device1("cuda", 1);
+  Graph graph1 = LoadGraph(device1);
+  Forward(graph1, device1);
+}
+#endif
+
 TEST(Api, graph_cpu_batching_test) {
  EnvScope scope;
  Device device("cpu");

--- a/oneflow/api/cpp/tests/tensor_test.cpp
+++ b/oneflow/api/cpp/tests/tensor_test.cpp
@@ -34,6 +34,15 @@ TEST(Api, device) {
  ASSERT_EQ(device.type(), "cuda");
  ASSERT_EQ(device.device_id(), 1);
 #endif
+#ifdef WITH_ROCM
+  device = Device("cuda:0");
+  ASSERT_EQ(device.type(), "cuda");
+  ASSERT_EQ(device.device_id(), 0);
+
+  device = Device("cuda", 1);
+  ASSERT_EQ(device.type(), "cuda");
+  ASSERT_EQ(device.device_id(), 1);
+#endif
 }

 TEST(Api, tensor) {

--- a/oneflow/api/python/flags.cpp
+++ b/oneflow/api/python/flags.cpp
@@ -17,12 +17,15 @@ limitations under the License.
 #ifdef WITH_CUDA
 #include <cuda.h>
 #endif
+#ifdef WITH_ROCM
+#include <hip/hip_runtime.h>
+#endif

 namespace oneflow {

 ONEFLOW_API_PYBIND11_MODULE("flags", m) {
  m.def("with_cuda", []() {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
    return true;
 #else
    return false;

--- a/oneflow/core/ep/test/primitive/fill_test.cpp
+++ b/oneflow/core/ep/test/primitive/fill_test.cpp
@@ -27,6 +27,11 @@ limitations under the License.
 #endif  // CUDA_VERSION >= 11000
 #endif  // WITH_CUDA

+#ifdef WITH_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#endif  // WITH_ROCM
+
 namespace oneflow {

 namespace ep {
@@ -87,6 +92,9 @@ TEST_F(PrimitiveTest, TestFill) {
                                             1024);
 #endif  // CUDA_VERSION >= 11000
 #endif  // WITH_CUDA
+// #ifdef WITH_ROCM
+//   TestFill<DataType::kFloat16, half>(&device_manager_registry_, available_device_types_, 1024);
+// #endif  // WITH_ROCM
  TestFill<DataType::kBool, bool>(&device_manager_registry_, available_device_types_, 1024);
 }


--- a/oneflow/core/framework/multi_client_session_context.cpp
+++ b/oneflow/core/framework/multi_client_session_context.cpp
@@ -41,6 +41,9 @@ limitations under the License.
 #ifdef WITH_CUDA
 #include <cuda.h>
 #endif  // WITH_CUDA
+#ifdef WITH_ROCM
+#include <hip/hip_runtime.h>
+#endif  // WITH_ROCM

 namespace oneflow {


--- a/oneflow/core/functional/impl/array_functor.cpp
+++ b/oneflow/core/functional/impl/array_functor.cpp
@@ -1389,6 +1389,9 @@ class CopyFunctor {

 #ifdef WITH_CUDA
    if (device_type == "cuda") { InitCudaContextOnce(device_id); }
+#endif
+#ifdef WITH_ROCM
+    if (device_type == "cuda") { InitCudaContextOnce(device_id); }
 #endif
    return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
  }

--- a/oneflow/core/graph_impl/decode_h2d_compute_task_node.cpp
+++ b/oneflow/core/graph_impl/decode_h2d_compute_task_node.cpp
@@ -62,6 +62,12 @@ REGISTER_NAMED_TASK_STREAM_INDEX_GETTER(DeviceType::kCUDA, TaskType::kDecodeH2D,

 #endif

+#ifdef WITH_ROCM
+
+REGISTER_NAMED_TASK_STREAM_INDEX_GETTER(DeviceType::kCUDA, TaskType::kDecodeH2D, "DECODE_H2D")
+
+#endif
+
 namespace {

 CompTaskNode* CreateCompTaskNodeByOpDeviceType(const OperatorConf& op_conf) {

--- a/oneflow/core/job/runtime.cpp
+++ b/oneflow/core/job/runtime.cpp
@@ -73,6 +73,9 @@ Runtime::Runtime(
 #ifdef WITH_CUDA
    Singleton<EagerNcclCommMgr>::Get()->CreateCommFromPlan(plan);
 #endif  // WITH_CUDA
+#ifdef WITH_ROCM
+    Singleton<EagerNcclCommMgr>::Get()->CreateCommFromPlan(plan);
+#endif  // WITH_ROCM
  }
  std::vector<const TaskProto*> source_tasks;
  source_tasks.reserve(plan.task().size());

--- a/oneflow/core/kernel/broadcast_to_compatible_with_kernel.cpp
+++ b/oneflow/core/kernel/broadcast_to_compatible_with_kernel.cpp
@@ -59,5 +59,8 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL,
 #if defined(WITH_CUDA)
 REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL(DeviceType::kCUDA, (float16, DataType::kFloat16))
 #endif
+#if defined(WITH_ROCM)
+REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL(DeviceType::kCUDA, (float16, DataType::kFloat16))
+#endif

 }  // namespace oneflow
--- a/oneflow/core/kernel/foreign_watch_kernel.cpp
+++ b/oneflow/core/kernel/foreign_watch_kernel.cpp
@@ -48,5 +48,9 @@ REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCPU,
 REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCUDA,
                            ForeignWatchKernel<DeviceType::kCUDA>);
 #endif
+#ifdef WITH_ROCM
+REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCUDA,
+                            ForeignWatchKernel<DeviceType::kCUDA>);
+#endif

 }  // namespace oneflow
--- a/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
+++ b/oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
@@ -114,6 +114,92 @@ REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t);

 #endif  // WITH_CUDA

+#ifdef WITH_ROCM
+
+namespace {
+
+class CudaHostMem {
+ public:
+  OF_DISALLOW_COPY_AND_MOVE(CudaHostMem);
+  CudaHostMem(const size_t size) { OF_CUDA_CHECK(hipMallocHost(reinterpret_cast<void **>(&ptr_), size)); }
+  ~CudaHostMem() { OF_CUDA_CHECK(hipHostFree(ptr_)); }
+  void* Ptr() const { return ptr_; }
+
+ private:
+  void* ptr_;
+};
+
+}  // namespace
+
+template<typename SizeType>
+class SyncDynamicResizeGPUKernel final : public Kernel {
+ public:
+  OF_DISALLOW_COPY_AND_MOVE(SyncDynamicResizeGPUKernel);
+  SyncDynamicResizeGPUKernel() = default;
+  ~SyncDynamicResizeGPUKernel() override = default;
+
+ private:
+  bool IsKernelLaunchSynchronized() const override { return false; }
+
+  void ForwardDataContent(KernelContext* ctx) const override {
+    const SyncDynamicResizeOpConf& conf = this->op_conf().sync_dynamic_resize_conf();
+    CHECK_EQ(conf.axis(), 0);
+    std::shared_ptr<CudaHostMem> cuda_host_mem_ptr;
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (queue_.empty()) {
+        cuda_host_mem_ptr.reset(new CudaHostMem(sizeof(SizeType)));
+      } else {
+        cuda_host_mem_ptr = queue_.front();
+        queue_.pop();
+      }
+    }
+    const Blob* in = ctx->BnInOp2Blob("in");
+    const Blob* size = ctx->BnInOp2Blob("size");
+    Blob* out = ctx->BnInOp2Blob("out");
+    AutoMemcpy(ctx->stream(), out->mut_dptr(), in->dptr(), in->ByteSizeOfBlobBody(),
+               out->mem_case(), in->mem_case());
+    AutoMemcpy(ctx->stream(), cuda_host_mem_ptr->Ptr(), size->dptr(), sizeof(SizeType),
+               MakeHostMemCase(), size->mem_case());
+    const auto& UpdateShape = [out, cuda_host_mem_ptr, conf, this]() {
+      const int64_t new_size = *reinterpret_cast<SizeType*>(cuda_host_mem_ptr->Ptr());
+      CHECK_GE(new_size, 0);
+      CHECK_LE(new_size, out->shape_view().At(conf.axis()));
+      // NOTE(Liang Depeng): `mut_shape_view` should be used here to get the blob's `MutShapeView`
+      //                     pointer. But this callback is called after `Kernel::Forward` function's
+      //                     execution and the header check is already been set to false at that
+      //                     moment. So we have to choose the `ForceMutShapeView` function with
+      //                     header checker disabled.
+      out->ForceMutShapeView()->Set(conf.axis(), new_size);
+      std::lock_guard<std::mutex> lock(mutex_);
+      queue_.push(cuda_host_mem_ptr);
+    };
+    if (conf.eager()) {
+      CHECK_JUST(ctx->stream()->Sync());
+      UpdateShape();
+    } else {
+      auto* actor_context_provider = CHECK_NOTNULL(dynamic_cast<ActorContextProvider*>(ctx));
+      actor_context_provider->GetActorContext()->AddCallback(UpdateShape);
+    }
+  }
+
+  mutable std::queue<std::shared_ptr<CudaHostMem>> queue_;
+  mutable std::mutex mutex_;
+};
+
+#define REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(stype)                                         \
+  NEW_REGISTER_KERNEL(OperatorConf::kSyncDynamicResizeConf, SyncDynamicResizeGPUKernel<stype>) \
+      .SetIsMatchedPred([](const KernelConf& kernel_conf) {                                    \
+        return (kernel_conf.op_attribute().op_conf().device_tag() == "cuda"                    \
+                && GetDataType<stype>::value                                                   \
+                       == kernel_conf.sync_dynamic_resize_conf().size_data_type());            \
+      })
+REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int8_t);
+REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int32_t);
+REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t);
+
+#endif  // WITH_ROCM
+
 template<typename SizeType>
 class SyncDynamicResizeCPUKernel final : public Kernel {
 public:

--- a/oneflow/core/kernel/user_kernel.h
+++ b/oneflow/core/kernel/user_kernel.h
@@ -29,6 +29,12 @@ limitations under the License.

 #endif  // WITH_CUDA

+#ifdef WITH_ROCM
+
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+
+#endif  // WITH_ROCM
+
 namespace oneflow {

 class UserKernelComputeContext;

--- a/oneflow/core/lazy/actor/light_actor.cpp
+++ b/oneflow/core/lazy/actor/light_actor.cpp
@@ -35,6 +35,12 @@ limitations under the License.

 #endif  // WITH_CUDA

+#ifdef WITH_ROCM
+
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+
+#endif  // WITH_ROCM
+
 namespace oneflow {

 namespace {

--- a/oneflow/core/lazy/actor/naive_actor.cpp
+++ b/oneflow/core/lazy/actor/naive_actor.cpp
@@ -39,5 +39,8 @@ REGISTER_ACTOR(TaskType::kCriticalSectionWaitTick, NaiveActor);
 #ifdef WITH_CUDA
 REGISTER_ACTOR(TaskType::kCopyHd, NaiveActor);
 #endif
+#ifdef WITH_ROCM
+REGISTER_ACTOR(TaskType::kCopyHd, NaiveActor);
+#endif
 REGISTER_ACTOR(TaskType::kCollectiveBoxingGeneric, NaiveActor);
 }  // namespace oneflow
--- a/oneflow/core/vm/bin_allocator_test.cpp
+++ b/oneflow/core/vm/bin_allocator_test.cpp
@@ -129,3 +129,122 @@ TEST(CudaBinAllocator, cuda_allocator) {
 }  // namespace oneflow

 #endif  // WITH_CUDA
+
+#ifdef WITH_ROCM
+
+#include "gtest/gtest.h"
+#include "oneflow/core/vm/bin_allocator.h"
+#include "oneflow/core/vm/thread_safe_allocator.h"
+#include "oneflow/core/device/cuda_util.h"
+
+namespace oneflow {
+namespace vm {
+
+class CudaBackendAllocator final : public Allocator {
+ public:
+  explicit CudaBackendAllocator(int64_t device_id) : device_id_(device_id) {}
+  ~CudaBackendAllocator() override = default;
+
+  Maybe<void> Allocate(char** mem_ptr, std::size_t size) override;
+  void Deallocate(char* mem_ptr, std::size_t size) override;
+  void DeviceReset() override;
+
+ private:
+  int64_t device_id_;
+};
+
+Maybe<void> CudaBackendAllocator::Allocate(char** mem_ptr, std::size_t size) {
+  hipSetDevice(device_id_);
+  if (hipMalloc(mem_ptr, size) != hipSuccess) { *mem_ptr = nullptr; }
+  return Maybe<void>::Ok();
+}
+
+void CudaBackendAllocator::Deallocate(char* mem_ptr, std::size_t size) {
+  hipSetDevice(device_id_);
+  OF_CUDA_CHECK(hipFree(mem_ptr));
+}
+
+void CudaBackendAllocator::DeviceReset() {
+  hipSetDevice(device_id_);
+  // NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
+  //   So there need release all cuda memory allocated by this process before core dump.
+  LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
+               << "device for releasing device memory.";
+  OF_CUDA_CHECK(hipDeviceReset());
+}
+
+TEST(CudaBinAllocator, cuda_allocator) {
+  int gpu_num = -1;
+  hipGetDeviceCount(&gpu_num);
+  if (gpu_num <= 0) {
+    LOG(INFO) << "CudaBinAllocator Test: Skip because of non GPU device.";
+    return;
+  }
+  ASSERT_TRUE(hipSuccess == hipSetDevice(0));
+  size_t free_bytes = -1;
+  size_t total_bytes = -1;
+  const size_t remain_bytes = 50 * 1048576;
+  ASSERT_TRUE(hipSuccess == hipMemGetInfo(&free_bytes, &total_bytes));
+  if (free_bytes <= remain_bytes || free_bytes - remain_bytes < remain_bytes) {
+    LOG(INFO)
+        << "CudaBinAllocator Test: Skip because of allocator mem bytes less than 50MiB in GPU 0";
+    return;
+  }
+  std::unique_ptr<Allocator> allo(
+      new BinAllocator(kCudaMemAllocAlignSize, std::make_unique<CudaBackendAllocator>(0)));
+  allo.reset(new SingleThreadOnlyAllocator(std::move(allo)));
+  Allocator* a = allo.get();
+  std::vector<char*> ptrs;
+  for (int i = 0; i < 512; ++i) {
+    char* ptr = nullptr;
+    CHECK_JUST(a->Allocate(&ptr, 1));
+    ASSERT_TRUE(ptr != nullptr);
+    ptrs.emplace_back(ptr);
+  }
+  std::sort(ptrs.begin(), ptrs.end());
+  for (int i = 0; i < 512; ++i) {
+    if (i > 0) {
+      ASSERT_TRUE(ptrs.at(i) != ptrs.at(i - 1));
+      ASSERT_TRUE(std::abs(ptrs.at(i) - ptrs.at(i - 1)) >= kCudaMemAllocAlignSize);
+    }
+    a->Deallocate(ptrs.at(i), 1);
+  }
+
+  ptrs.clear();
+  for (int i = 0; i < 2048; ++i) {
+    char* ptr = nullptr;
+    CHECK_JUST(a->Allocate(&ptr, 10000));
+    ASSERT_TRUE(ptr != nullptr);
+    ptrs.emplace_back(ptr);
+  }
+  std::sort(ptrs.begin(), ptrs.end());
+  for (int i = 0; i < 2048; ++i) {
+    if (i > 0) {
+      ASSERT_TRUE(ptrs.at(i) != ptrs.at(i - 1));
+      ASSERT_TRUE(std::abs(ptrs.at(i) - ptrs.at(i - 1)) >= kCudaMemAllocAlignSize);
+    }
+    a->Deallocate(ptrs.at(i), 10000);
+  }
+
+  char* data_ptr_1 = nullptr;
+  CHECK_JUST(a->Allocate(&data_ptr_1, 2048 * sizeof(float)));
+
+  char* data_ptr_2 = nullptr;
+  CHECK_JUST(a->Allocate(&data_ptr_2, 4096 * sizeof(double)));
+
+  ASSERT_TRUE(data_ptr_1 != data_ptr_2);
+  if (data_ptr_1 < data_ptr_2) {
+    ASSERT_TRUE(data_ptr_1 + 2048 * sizeof(float) <= data_ptr_2);
+  } else {
+    ASSERT_TRUE(data_ptr_2 + 4096 * sizeof(double) <= data_ptr_1);
+  }
+
+  a->Deallocate(data_ptr_2, 4096 * sizeof(double));
+  a->Deallocate(data_ptr_1, 2048 * sizeof(float));
+}
+
+}  // namespace vm
+}  // namespace oneflow
+
+
+#endif  // WITH_ROCM
--- a/oneflow/core/vm/cuda_backend_allocator.cpp
+++ b/oneflow/core/vm/cuda_backend_allocator.cpp
@@ -50,3 +50,41 @@ void CudaBackendAllocator::DeviceReset() {
 }  // namespace oneflow

 #endif
+
+#ifdef WITH_ROCM
+
+#include "oneflow/core/vm/cuda_backend_allocator.h"
+#include "oneflow/core/device/cuda_util.h"
+#include <iostream>
+
+namespace oneflow {
+namespace vm {
+
+Maybe<void> CudaBackendAllocator::Allocate(char** mem_ptr, std::size_t size) {
+  hipSetDevice(device_id_);
+  if (hipMalloc(mem_ptr, size) != hipSuccess) {
+    *mem_ptr = nullptr;
+    return Error::OutOfMemoryError() << "cuda allocator out of memory";
+  }
+  return Maybe<void>::Ok();
+}
+
+void CudaBackendAllocator::Deallocate(char* mem_ptr, std::size_t size) {
+  hipSetDevice(device_id_);
+  OF_CUDA_CHECK(hipFree(mem_ptr));
+}
+
+void CudaBackendAllocator::DeviceReset() {
+  hipSetDevice(device_id_);
+  // NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
+  //   So there need release all cuda memory allocated by this process before core dump.
+  LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
+               << "device for releasing device memory.";
+  OF_CUDA_CHECK(hipDeviceReset());
+}
+
+}  // namespace vm
+}  // namespace oneflow
+
+#endif
+
--- a/oneflow/core/vm/cuda_host_allocator.cpp
+++ b/oneflow/core/vm/cuda_host_allocator.cpp
@@ -63,3 +63,54 @@ COMMAND(Singleton<CudaHostAllocator>::SetAllocated(new CudaHostAllocator(0)));
 }  // namespace oneflow

 #endif
+
+#ifdef WITH_ROCM
+
+#include "oneflow/core/vm/cuda_host_allocator.h"
+#include "oneflow/core/device/cuda_util.h"
+
+namespace oneflow {
+namespace vm {
+
+CudaHostAllocator::~CudaHostAllocator() {
+  CudaCurrentDeviceGuard guard(device_id_);
+  for (const auto& ptr_vec : granularity2free_ptrs_) {
+    for (char* ptr : ptr_vec) { OF_CUDA_CHECK(hipHostFree(ptr)); }
+  }
+  for (const auto& pair : occupied_ptr2granularity_) { OF_CUDA_CHECK(hipHostFree(pair.first)); }
+}
+
+Maybe<void> CudaHostAllocator::Allocate(char** mem_ptr, std::size_t size) {
+  std::size_t granularity = std::ceil(std::log2(size));
+  CHECK_GE_OR_RETURN(granularity, 0) << "out of range";
+  CHECK_LT_OR_RETURN(granularity, kCudaHostMaxGranularity) << "invalid granularity";
+  CHECK_LE_OR_RETURN(size, 1 << granularity) << "out of range";
+  CudaCurrentDeviceGuard guard(device_id_);
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto* vec = &granularity2free_ptrs_[granularity];
+  if (vec->empty()) {
+    char* ptr = nullptr;
+    OF_CUDA_CHECK(hipMallocHost(reinterpret_cast<void **>(&ptr), 1 << granularity));
+    vec->emplace_back(ptr);
+  }
+  *mem_ptr = vec->back();
+  vec->pop_back();
+  occupied_ptr2granularity_[*mem_ptr] = granularity;
+  return Maybe<void>::Ok();
+}
+
+void CudaHostAllocator::Deallocate(char* mem_ptr, std::size_t size) {
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto iter = occupied_ptr2granularity_.find(mem_ptr);
+  CHECK(iter != occupied_ptr2granularity_.end());
+  std::size_t granularity = iter->second;
+  occupied_ptr2granularity_.erase(iter);
+  granularity2free_ptrs_[granularity].emplace_back(mem_ptr);
+}
+
+COMMAND(Singleton<CudaHostAllocator>::SetAllocated(new CudaHostAllocator(0)));
+
+}  // namespace vm
+}  // namespace oneflow
+
+#endif
--- a/oneflow/core/vm/ep_backend_allocator.cpp
+++ b/oneflow/core/vm/ep_backend_allocator.cpp
@@ -39,6 +39,16 @@ void EpBackendAllocator::DeviceReset() {
    OF_CUDA_CHECK(cudaDeviceReset());
  }
 #endif
+#ifdef WITH_ROCM
+  if (ep_device_->device_type() == DeviceType::kCUDA) {
+    ep_device_->SetAsActiveDevice();
+    // NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
+    //   So there need release all cuda memory allocated by this process before core dump.
+    LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
+                 << "device for releasing device memory.";
+    OF_CUDA_CHECK(hipDeviceReset());
+  }
+#endif
 }

 }  // namespace vm