"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "12bb88738a5b6b96767c4165282f644495780a80"
Commit 460b49f4 authored by yuguo

trivial files migration

parent 5e07668d
...@@ -84,6 +84,26 @@ TEST(Api, graph_multi_gpu_test) {
}
#endif
#ifdef WITH_ROCM
TEST(Api, graph_gpu_test) {
EnvScope scope;
Device device("cuda", 0);
Graph graph = LoadGraph(device);
Forward(graph, device);
}
TEST(Api, graph_multi_gpu_test) {
EnvScope scope;
Device device("cuda", 0);
Graph graph = LoadGraph(device);
Forward(graph, device);
Device device1("cuda", 1);
Graph graph1 = LoadGraph(device1);
Forward(graph1, device1);
}
#endif
TEST(Api, graph_cpu_batching_test) {
EnvScope scope;
Device device("cpu");
......
...@@ -34,6 +34,15 @@ TEST(Api, device) {
ASSERT_EQ(device.type(), "cuda");
ASSERT_EQ(device.device_id(), 1);
#endif
#ifdef WITH_ROCM
device = Device("cuda:0");
ASSERT_EQ(device.type(), "cuda");
ASSERT_EQ(device.device_id(), 0);
device = Device("cuda", 1);
ASSERT_EQ(device.type(), "cuda");
ASSERT_EQ(device.device_id(), 1);
#endif
}
TEST(Api, tensor) {
......
...@@ -17,12 +17,15 @@ limitations under the License.
#ifdef WITH_CUDA
#include <cuda.h>
#endif
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#endif
namespace oneflow {
ONEFLOW_API_PYBIND11_MODULE("flags", m) {
m.def("with_cuda", []() {
#if defined(WITH_CUDA) || defined(WITH_ROCM)
return true;
#else
return false;
......
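The with_cuda flag above is the one place this commit merges the two build guards into a single #if defined(WITH_CUDA) || defined(WITH_ROCM) check; everywhere else the CUDA block is duplicated verbatim under #ifdef WITH_ROCM. A minimal sketch of how a shared guard could collapse those pairs (the WITH_CUDA_OR_ROCM symbol is hypothetical and not part of this commit):

// Hypothetical single guard, defined once in a common header (illustrative only).
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#define WITH_CUDA_OR_ROCM
#endif

// A duplicated pair such as the kCopyHd actor registration later in this commit would then shrink to:
#ifdef WITH_CUDA_OR_ROCM
REGISTER_ACTOR(TaskType::kCopyHd, NaiveActor);
#endif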
...@@ -27,6 +27,11 @@ limitations under the License.
#endif // CUDA_VERSION >= 11000
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#endif // WITH_ROCM
namespace oneflow {
namespace ep {
...@@ -87,6 +92,9 @@ TEST_F(PrimitiveTest, TestFill) {
1024);
#endif // CUDA_VERSION >= 11000
#endif // WITH_CUDA
// #ifdef WITH_ROCM
// TestFill<DataType::kFloat16, half>(&device_manager_registry_, available_device_types_, 1024);
// #endif // WITH_ROCM
TestFill<DataType::kBool, bool>(&device_manager_registry_, available_device_types_, 1024);
}
......
...@@ -41,6 +41,9 @@ limitations under the License.
#ifdef WITH_CUDA
#include <cuda.h>
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#endif // WITH_ROCM
namespace oneflow {
......
...@@ -1389,6 +1389,9 @@ class CopyFunctor {
#ifdef WITH_CUDA
if (device_type == "cuda") { InitCudaContextOnce(device_id); }
#endif
#ifdef WITH_ROCM
if (device_type == "cuda") { InitCudaContextOnce(device_id); }
#endif
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
......
...@@ -62,6 +62,12 @@ REGISTER_NAMED_TASK_STREAM_INDEX_GETTER(DeviceType::kCUDA, TaskType::kDecodeH2D,
#endif
#ifdef WITH_ROCM
REGISTER_NAMED_TASK_STREAM_INDEX_GETTER(DeviceType::kCUDA, TaskType::kDecodeH2D, "DECODE_H2D")
#endif
namespace {
CompTaskNode* CreateCompTaskNodeByOpDeviceType(const OperatorConf& op_conf) {
......
...@@ -73,6 +73,9 @@ Runtime::Runtime(
#ifdef WITH_CUDA
Singleton<EagerNcclCommMgr>::Get()->CreateCommFromPlan(plan);
#endif // WITH_CUDA
#ifdef WITH_ROCM
Singleton<EagerNcclCommMgr>::Get()->CreateCommFromPlan(plan);
#endif // WITH_ROCM
}
std::vector<const TaskProto*> source_tasks;
source_tasks.reserve(plan.task().size());
......
...@@ -59,5 +59,8 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL,
#if defined(WITH_CUDA)
REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL(DeviceType::kCUDA, (float16, DataType::kFloat16))
#endif
#if defined(WITH_ROCM)
REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL(DeviceType::kCUDA, (float16, DataType::kFloat16))
#endif
} // namespace oneflow
...@@ -48,5 +48,9 @@ REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCPU,
REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCUDA,
ForeignWatchKernel<DeviceType::kCUDA>);
#endif
#ifdef WITH_ROCM
REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCUDA,
ForeignWatchKernel<DeviceType::kCUDA>);
#endif
} // namespace oneflow
...@@ -114,6 +114,92 @@ REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t);
#endif // WITH_CUDA
#ifdef WITH_ROCM
namespace {
class CudaHostMem {
public:
OF_DISALLOW_COPY_AND_MOVE(CudaHostMem);
CudaHostMem(const size_t size) { OF_CUDA_CHECK(hipMallocHost(reinterpret_cast<void **>(&ptr_), size)); }
~CudaHostMem() { OF_CUDA_CHECK(hipHostFree(ptr_)); }
void* Ptr() const { return ptr_; }
private:
void* ptr_;
};
} // namespace
template<typename SizeType>
class SyncDynamicResizeGPUKernel final : public Kernel {
public:
OF_DISALLOW_COPY_AND_MOVE(SyncDynamicResizeGPUKernel);
SyncDynamicResizeGPUKernel() = default;
~SyncDynamicResizeGPUKernel() override = default;
private:
bool IsKernelLaunchSynchronized() const override { return false; }
void ForwardDataContent(KernelContext* ctx) const override {
const SyncDynamicResizeOpConf& conf = this->op_conf().sync_dynamic_resize_conf();
CHECK_EQ(conf.axis(), 0);
std::shared_ptr<CudaHostMem> cuda_host_mem_ptr;
{
std::lock_guard<std::mutex> lock(mutex_);
if (queue_.empty()) {
cuda_host_mem_ptr.reset(new CudaHostMem(sizeof(SizeType)));
} else {
cuda_host_mem_ptr = queue_.front();
queue_.pop();
}
}
const Blob* in = ctx->BnInOp2Blob("in");
const Blob* size = ctx->BnInOp2Blob("size");
Blob* out = ctx->BnInOp2Blob("out");
AutoMemcpy(ctx->stream(), out->mut_dptr(), in->dptr(), in->ByteSizeOfBlobBody(),
out->mem_case(), in->mem_case());
AutoMemcpy(ctx->stream(), cuda_host_mem_ptr->Ptr(), size->dptr(), sizeof(SizeType),
MakeHostMemCase(), size->mem_case());
const auto& UpdateShape = [out, cuda_host_mem_ptr, conf, this]() {
const int64_t new_size = *reinterpret_cast<SizeType*>(cuda_host_mem_ptr->Ptr());
CHECK_GE(new_size, 0);
CHECK_LE(new_size, out->shape_view().At(conf.axis()));
// NOTE(Liang Depeng): `mut_shape_view` should be used here to get the blob's `MutShapeView`
// pointer. But this callback is called after `Kernel::Forward` function's
// execution and the header check is already been set to false at that
// moment. So we have to choose the `ForceMutShapeView` function with
// header checker disabled.
out->ForceMutShapeView()->Set(conf.axis(), new_size);
std::lock_guard<std::mutex> lock(mutex_);
queue_.push(cuda_host_mem_ptr);
};
if (conf.eager()) {
CHECK_JUST(ctx->stream()->Sync());
UpdateShape();
} else {
auto* actor_context_provider = CHECK_NOTNULL(dynamic_cast<ActorContextProvider*>(ctx));
actor_context_provider->GetActorContext()->AddCallback(UpdateShape);
}
}
mutable std::queue<std::shared_ptr<CudaHostMem>> queue_;
mutable std::mutex mutex_;
};
#define REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(stype) \
NEW_REGISTER_KERNEL(OperatorConf::kSyncDynamicResizeConf, SyncDynamicResizeGPUKernel<stype>) \
.SetIsMatchedPred([](const KernelConf& kernel_conf) { \
return (kernel_conf.op_attribute().op_conf().device_tag() == "cuda" \
&& GetDataType<stype>::value \
== kernel_conf.sync_dynamic_resize_conf().size_data_type()); \
})
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int8_t);
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int32_t);
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t);
#endif // WITH_ROCM
template<typename SizeType>
class SyncDynamicResizeCPUKernel final : public Kernel {
public:
......
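The ROCm branch above pins host memory with hipMallocHost and recycles the buffers through a queue. As a standalone reference, here is a minimal pinned-memory round trip using the current hipHostMalloc spelling (hipMallocHost is the older alias of the same call); this sketch is illustrative and not part of the commit:

#include <hip/hip_runtime.h>
#include <cassert>

int main() {
  void* pinned = nullptr;
  // Allocate page-locked host memory; device transfers to/from it avoid an extra staging copy.
  assert(hipHostMalloc(&pinned, sizeof(int), hipHostMallocDefault) == hipSuccess);
  *static_cast<int*>(pinned) = 42;            // plain host-side access works
  assert(hipHostFree(pinned) == hipSuccess);  // matching free for hipHostMalloc
  return 0;
}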
...@@ -29,6 +29,12 @@ limitations under the License.
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#endif // WITH_ROCM
namespace oneflow {
class UserKernelComputeContext;
......
...@@ -35,6 +35,12 @@ limitations under the License.
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#endif // WITH_ROCM
namespace oneflow {
namespace {
......
...@@ -39,5 +39,8 @@ REGISTER_ACTOR(TaskType::kCriticalSectionWaitTick, NaiveActor);
#ifdef WITH_CUDA
REGISTER_ACTOR(TaskType::kCopyHd, NaiveActor);
#endif
#ifdef WITH_ROCM
REGISTER_ACTOR(TaskType::kCopyHd, NaiveActor);
#endif
REGISTER_ACTOR(TaskType::kCollectiveBoxingGeneric, NaiveActor);
} // namespace oneflow
...@@ -129,3 +129,122 @@ TEST(CudaBinAllocator, cuda_allocator) {
} // namespace oneflow
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include "gtest/gtest.h"
#include "oneflow/core/vm/bin_allocator.h"
#include "oneflow/core/vm/thread_safe_allocator.h"
#include "oneflow/core/device/cuda_util.h"
namespace oneflow {
namespace vm {
class CudaBackendAllocator final : public Allocator {
public:
explicit CudaBackendAllocator(int64_t device_id) : device_id_(device_id) {}
~CudaBackendAllocator() override = default;
Maybe<void> Allocate(char** mem_ptr, std::size_t size) override;
void Deallocate(char* mem_ptr, std::size_t size) override;
void DeviceReset() override;
private:
int64_t device_id_;
};
Maybe<void> CudaBackendAllocator::Allocate(char** mem_ptr, std::size_t size) {
hipSetDevice(device_id_);
if (hipMalloc(mem_ptr, size) != hipSuccess) { *mem_ptr = nullptr; }
return Maybe<void>::Ok();
}
void CudaBackendAllocator::Deallocate(char* mem_ptr, std::size_t size) {
hipSetDevice(device_id_);
OF_CUDA_CHECK(hipFree(mem_ptr));
}
void CudaBackendAllocator::DeviceReset() {
hipSetDevice(device_id_);
// NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
// So there need release all cuda memory allocated by this process before core dump.
LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
<< "device for releasing device memory.";
OF_CUDA_CHECK(hipDeviceReset());
}
TEST(CudaBinAllocator, cuda_allocator) {
int gpu_num = -1;
hipGetDeviceCount(&gpu_num);
if (gpu_num <= 0) {
LOG(INFO) << "CudaBinAllocator Test: Skip because of non GPU device.";
return;
}
ASSERT_TRUE(hipSuccess == hipSetDevice(0));
size_t free_bytes = -1;
size_t total_bytes = -1;
const size_t remain_bytes = 50 * 1048576;
ASSERT_TRUE(hipSuccess == hipMemGetInfo(&free_bytes, &total_bytes));
if (free_bytes <= remain_bytes || free_bytes - remain_bytes < remain_bytes) {
LOG(INFO)
<< "CudaBinAllocator Test: Skip because of allocator mem bytes less than 50MiB in GPU 0";
return;
}
std::unique_ptr<Allocator> allo(
new BinAllocator(kCudaMemAllocAlignSize, std::make_unique<CudaBackendAllocator>(0)));
allo.reset(new SingleThreadOnlyAllocator(std::move(allo)));
Allocator* a = allo.get();
std::vector<char*> ptrs;
for (int i = 0; i < 512; ++i) {
char* ptr = nullptr;
CHECK_JUST(a->Allocate(&ptr, 1));
ASSERT_TRUE(ptr != nullptr);
ptrs.emplace_back(ptr);
}
std::sort(ptrs.begin(), ptrs.end());
for (int i = 0; i < 512; ++i) {
if (i > 0) {
ASSERT_TRUE(ptrs.at(i) != ptrs.at(i - 1));
ASSERT_TRUE(std::abs(ptrs.at(i) - ptrs.at(i - 1)) >= kCudaMemAllocAlignSize);
}
a->Deallocate(ptrs.at(i), 1);
}
ptrs.clear();
for (int i = 0; i < 2048; ++i) {
char* ptr = nullptr;
CHECK_JUST(a->Allocate(&ptr, 10000));
ASSERT_TRUE(ptr != nullptr);
ptrs.emplace_back(ptr);
}
std::sort(ptrs.begin(), ptrs.end());
for (int i = 0; i < 2048; ++i) {
if (i > 0) {
ASSERT_TRUE(ptrs.at(i) != ptrs.at(i - 1));
ASSERT_TRUE(std::abs(ptrs.at(i) - ptrs.at(i - 1)) >= kCudaMemAllocAlignSize);
}
a->Deallocate(ptrs.at(i), 10000);
}
char* data_ptr_1 = nullptr;
CHECK_JUST(a->Allocate(&data_ptr_1, 2048 * sizeof(float)));
char* data_ptr_2 = nullptr;
CHECK_JUST(a->Allocate(&data_ptr_2, 4096 * sizeof(double)));
ASSERT_TRUE(data_ptr_1 != data_ptr_2);
if (data_ptr_1 < data_ptr_2) {
ASSERT_TRUE(data_ptr_1 + 2048 * sizeof(float) <= data_ptr_2);
} else {
ASSERT_TRUE(data_ptr_2 + 4096 * sizeof(double) <= data_ptr_1);
}
a->Deallocate(data_ptr_2, 4096 * sizeof(double));
a->Deallocate(data_ptr_1, 2048 * sizeof(float));
}
} // namespace vm
} // namespace oneflow
#endif // WITH_ROCM
...@@ -50,3 +50,41 @@ void CudaBackendAllocator::DeviceReset() {
} // namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/vm/cuda_backend_allocator.h"
#include "oneflow/core/device/cuda_util.h"
#include <iostream>
namespace oneflow {
namespace vm {
Maybe<void> CudaBackendAllocator::Allocate(char** mem_ptr, std::size_t size) {
hipSetDevice(device_id_);
if (hipMalloc(mem_ptr, size) != hipSuccess) {
*mem_ptr = nullptr;
return Error::OutOfMemoryError() << "cuda allocator out of memory";
}
return Maybe<void>::Ok();
}
void CudaBackendAllocator::Deallocate(char* mem_ptr, std::size_t size) {
hipSetDevice(device_id_);
OF_CUDA_CHECK(hipFree(mem_ptr));
}
void CudaBackendAllocator::DeviceReset() {
hipSetDevice(device_id_);
// NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
// So there need release all cuda memory allocated by this process before core dump.
LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
<< "device for releasing device memory.";
OF_CUDA_CHECK(hipDeviceReset());
}
} // namespace vm
} // namespace oneflow
#endif
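For context, the hipMalloc failure check in Allocate above does not throw; it only compares the return code against hipSuccess and reports the OOM through the Maybe<void> return value. A self-contained sketch of that HIP error-checking pattern (illustrative only, not part of the commit):

#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstddef>

int main() {
  void* p = nullptr;
  // Deliberately oversized request so the non-throwing failure path is taken.
  const std::size_t huge = std::size_t(1) << 60;
  hipError_t err = hipMalloc(&p, huge);
  if (err != hipSuccess) {
    std::printf("hipMalloc failed: %s\n", hipGetErrorString(err));
  } else {
    (void)hipFree(p);
  }
  return 0;
}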
...@@ -63,3 +63,54 @@ COMMAND(Singleton<CudaHostAllocator>::SetAllocated(new CudaHostAllocator(0)));
} // namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/vm/cuda_host_allocator.h"
#include "oneflow/core/device/cuda_util.h"
namespace oneflow {
namespace vm {
CudaHostAllocator::~CudaHostAllocator() {
CudaCurrentDeviceGuard guard(device_id_);
for (const auto& ptr_vec : granularity2free_ptrs_) {
for (char* ptr : ptr_vec) { OF_CUDA_CHECK(hipHostFree(ptr)); }
}
for (const auto& pair : occupied_ptr2granularity_) { OF_CUDA_CHECK(hipHostFree(pair.first)); }
}
Maybe<void> CudaHostAllocator::Allocate(char** mem_ptr, std::size_t size) {
std::size_t granularity = std::ceil(std::log2(size));
CHECK_GE_OR_RETURN(granularity, 0) << "out of range";
CHECK_LT_OR_RETURN(granularity, kCudaHostMaxGranularity) << "invalid granularity";
CHECK_LE_OR_RETURN(size, 1 << granularity) << "out of range";
CudaCurrentDeviceGuard guard(device_id_);
std::unique_lock<std::mutex> lock(mutex_);
auto* vec = &granularity2free_ptrs_[granularity];
if (vec->empty()) {
char* ptr = nullptr;
OF_CUDA_CHECK(hipMallocHost(reinterpret_cast<void **>(&ptr), 1 << granularity));
vec->emplace_back(ptr);
}
*mem_ptr = vec->back();
vec->pop_back();
occupied_ptr2granularity_[*mem_ptr] = granularity;
return Maybe<void>::Ok();
}
void CudaHostAllocator::Deallocate(char* mem_ptr, std::size_t size) {
std::unique_lock<std::mutex> lock(mutex_);
auto iter = occupied_ptr2granularity_.find(mem_ptr);
CHECK(iter != occupied_ptr2granularity_.end());
std::size_t granularity = iter->second;
occupied_ptr2granularity_.erase(iter);
granularity2free_ptrs_[granularity].emplace_back(mem_ptr);
}
COMMAND(Singleton<CudaHostAllocator>::SetAllocated(new CudaHostAllocator(0)));
} // namespace vm
} // namespace oneflow
#endif
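Allocate above rounds every request up to the next power of two (granularity = ceil(log2(size))) and keeps one free list per bucket, so a freed block can be reused by any later request that maps to the same bucket. A tiny standalone sketch of that arithmetic (illustrative; a 1000-byte request lands in the 1024-byte bucket):

#include <cmath>
#include <cstddef>
#include <iostream>

int main() {
  std::size_t size = 1000;
  // Same rounding as CudaHostAllocator::Allocate above.
  std::size_t granularity = static_cast<std::size_t>(std::ceil(std::log2(static_cast<double>(size))));
  std::cout << granularity << " " << (std::size_t{1} << granularity) << "\n";  // prints "10 1024"
  return 0;
}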
...@@ -39,6 +39,16 @@ void EpBackendAllocator::DeviceReset() {
OF_CUDA_CHECK(cudaDeviceReset());
}
#endif
#ifdef WITH_ROCM
if (ep_device_->device_type() == DeviceType::kCUDA) {
ep_device_->SetAsActiveDevice();
// NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
// So there need release all cuda memory allocated by this process before core dump.
LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
<< "device for releasing device memory.";
OF_CUDA_CHECK(hipDeviceReset());
}
#endif
}
} // namespace vm
......