Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Oneflow
Commits
460b49f4
Commit
460b49f4
authored
Nov 02, 2022
by
yuguo
Browse files
trival files migration
parent
5e07668d
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
382 additions
and
1 deletion
+382
-1
oneflow/api/cpp/tests/graph_test.cpp
oneflow/api/cpp/tests/graph_test.cpp
+20
-0
oneflow/api/cpp/tests/tensor_test.cpp
oneflow/api/cpp/tests/tensor_test.cpp
+9
-0
oneflow/api/python/flags.cpp
oneflow/api/python/flags.cpp
+4
-1
oneflow/core/ep/test/primitive/fill_test.cpp
oneflow/core/ep/test/primitive/fill_test.cpp
+8
-0
oneflow/core/framework/multi_client_session_context.cpp
oneflow/core/framework/multi_client_session_context.cpp
+3
-0
oneflow/core/functional/impl/array_functor.cpp
oneflow/core/functional/impl/array_functor.cpp
+3
-0
oneflow/core/graph_impl/decode_h2d_compute_task_node.cpp
oneflow/core/graph_impl/decode_h2d_compute_task_node.cpp
+6
-0
oneflow/core/job/runtime.cpp
oneflow/core/job/runtime.cpp
+3
-0
oneflow/core/kernel/broadcast_to_compatible_with_kernel.cpp
oneflow/core/kernel/broadcast_to_compatible_with_kernel.cpp
+3
-0
oneflow/core/kernel/foreign_watch_kernel.cpp
oneflow/core/kernel/foreign_watch_kernel.cpp
+4
-0
oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
+86
-0
oneflow/core/kernel/user_kernel.h
oneflow/core/kernel/user_kernel.h
+6
-0
oneflow/core/lazy/actor/light_actor.cpp
oneflow/core/lazy/actor/light_actor.cpp
+6
-0
oneflow/core/lazy/actor/naive_actor.cpp
oneflow/core/lazy/actor/naive_actor.cpp
+3
-0
oneflow/core/vm/bin_allocator_test.cpp
oneflow/core/vm/bin_allocator_test.cpp
+119
-0
oneflow/core/vm/cuda_backend_allocator.cpp
oneflow/core/vm/cuda_backend_allocator.cpp
+38
-0
oneflow/core/vm/cuda_host_allocator.cpp
oneflow/core/vm/cuda_host_allocator.cpp
+51
-0
oneflow/core/vm/ep_backend_allocator.cpp
oneflow/core/vm/ep_backend_allocator.cpp
+10
-0
No files found.
oneflow/api/cpp/tests/graph_test.cpp
View file @
460b49f4
...
...
@@ -84,6 +84,26 @@ TEST(Api, graph_multi_gpu_test) {
}
#endif
#ifdef WITH_ROCM
TEST
(
Api
,
graph_gpu_test
)
{
EnvScope
scope
;
Device
device
(
"cuda"
,
0
);
Graph
graph
=
LoadGraph
(
device
);
Forward
(
graph
,
device
);
}
TEST
(
Api
,
graph_multi_gpu_test
)
{
EnvScope
scope
;
Device
device
(
"cuda"
,
0
);
Graph
graph
=
LoadGraph
(
device
);
Forward
(
graph
,
device
);
Device
device1
(
"cuda"
,
1
);
Graph
graph1
=
LoadGraph
(
device1
);
Forward
(
graph1
,
device1
);
}
#endif
TEST
(
Api
,
graph_cpu_batching_test
)
{
EnvScope
scope
;
Device
device
(
"cpu"
);
...
...
oneflow/api/cpp/tests/tensor_test.cpp
View file @
460b49f4
...
...
@@ -34,6 +34,15 @@ TEST(Api, device) {
ASSERT_EQ
(
device
.
type
(),
"cuda"
);
ASSERT_EQ
(
device
.
device_id
(),
1
);
#endif
#ifdef WITH_ROCM
device
=
Device
(
"cuda:0"
);
ASSERT_EQ
(
device
.
type
(),
"cuda"
);
ASSERT_EQ
(
device
.
device_id
(),
0
);
device
=
Device
(
"cuda"
,
1
);
ASSERT_EQ
(
device
.
type
(),
"cuda"
);
ASSERT_EQ
(
device
.
device_id
(),
1
);
#endif
}
TEST
(
Api
,
tensor
)
{
...
...
oneflow/api/python/flags.cpp
View file @
460b49f4
...
...
@@ -17,12 +17,15 @@ limitations under the License.
#ifdef WITH_CUDA
#include <cuda.h>
#endif
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#endif
namespace
oneflow
{
ONEFLOW_API_PYBIND11_MODULE
(
"flags"
,
m
)
{
m
.
def
(
"with_cuda"
,
[]()
{
#ifdef
WITH_CUDA
#if
def
ined(
WITH_CUDA
) || defined(WITH_ROCM)
return
true
;
#else
return
false
;
...
...
oneflow/core/ep/test/primitive/fill_test.cpp
View file @
460b49f4
...
...
@@ -27,6 +27,11 @@ limitations under the License.
#endif // CUDA_VERSION >= 11000
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#endif // WITH_ROCM
namespace
oneflow
{
namespace
ep
{
...
...
@@ -87,6 +92,9 @@ TEST_F(PrimitiveTest, TestFill) {
1024
);
#endif // CUDA_VERSION >= 11000
#endif // WITH_CUDA
// #ifdef WITH_ROCM
// TestFill<DataType::kFloat16, half>(&device_manager_registry_, available_device_types_, 1024);
// #endif // WITH_ROCM
TestFill
<
DataType
::
kBool
,
bool
>
(
&
device_manager_registry_
,
available_device_types_
,
1024
);
}
...
...
oneflow/core/framework/multi_client_session_context.cpp
View file @
460b49f4
...
...
@@ -41,6 +41,9 @@ limitations under the License.
#ifdef WITH_CUDA
#include <cuda.h>
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#endif // WITH_ROCM
namespace
oneflow
{
...
...
oneflow/core/functional/impl/array_functor.cpp
View file @
460b49f4
...
...
@@ -1389,6 +1389,9 @@ class CopyFunctor {
#ifdef WITH_CUDA
if
(
device_type
==
"cuda"
)
{
InitCudaContextOnce
(
device_id
);
}
#endif
#ifdef WITH_ROCM
if
(
device_type
==
"cuda"
)
{
InitCudaContextOnce
(
device_id
);
}
#endif
return
OpInterpUtil
::
Dispatch
<
Tensor
>
(
*
op_
,
{
x
},
attrs
);
}
...
...
oneflow/core/graph_impl/decode_h2d_compute_task_node.cpp
View file @
460b49f4
...
...
@@ -62,6 +62,12 @@ REGISTER_NAMED_TASK_STREAM_INDEX_GETTER(DeviceType::kCUDA, TaskType::kDecodeH2D,
#endif
#ifdef WITH_ROCM
REGISTER_NAMED_TASK_STREAM_INDEX_GETTER
(
DeviceType
::
kCUDA
,
TaskType
::
kDecodeH2D
,
"DECODE_H2D"
)
#endif
namespace
{
CompTaskNode
*
CreateCompTaskNodeByOpDeviceType
(
const
OperatorConf
&
op_conf
)
{
...
...
oneflow/core/job/runtime.cpp
View file @
460b49f4
...
...
@@ -73,6 +73,9 @@ Runtime::Runtime(
#ifdef WITH_CUDA
Singleton
<
EagerNcclCommMgr
>::
Get
()
->
CreateCommFromPlan
(
plan
);
#endif // WITH_CUDA
#ifdef WITH_ROCM
Singleton
<
EagerNcclCommMgr
>::
Get
()
->
CreateCommFromPlan
(
plan
);
#endif // WITH_ROCM
}
std
::
vector
<
const
TaskProto
*>
source_tasks
;
source_tasks
.
reserve
(
plan
.
task
().
size
());
...
...
oneflow/core/kernel/broadcast_to_compatible_with_kernel.cpp
View file @
460b49f4
...
...
@@ -59,5 +59,8 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL,
#if defined(WITH_CUDA)
REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL
(
DeviceType
::
kCUDA
,
(
float16
,
DataType
::
kFloat16
))
#endif
#if defined(WITH_ROCM)
REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL
(
DeviceType
::
kCUDA
,
(
float16
,
DataType
::
kFloat16
))
#endif
}
// namespace oneflow
oneflow/core/kernel/foreign_watch_kernel.cpp
View file @
460b49f4
...
...
@@ -48,5 +48,9 @@ REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCPU,
REGISTER_KERNEL_WITH_DEVICE
(
OperatorConf
::
kForeignWatchConf
,
DeviceType
::
kCUDA
,
ForeignWatchKernel
<
DeviceType
::
kCUDA
>
);
#endif
#ifdef WITH_ROCM
REGISTER_KERNEL_WITH_DEVICE
(
OperatorConf
::
kForeignWatchConf
,
DeviceType
::
kCUDA
,
ForeignWatchKernel
<
DeviceType
::
kCUDA
>
);
#endif
}
// namespace oneflow
oneflow/core/kernel/sync_dynamic_resize_kernel.cpp
View file @
460b49f4
...
...
@@ -114,6 +114,92 @@ REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t);
#endif // WITH_CUDA
#ifdef WITH_ROCM
namespace
{
class
CudaHostMem
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
CudaHostMem
);
CudaHostMem
(
const
size_t
size
)
{
OF_CUDA_CHECK
(
hipMallocHost
(
reinterpret_cast
<
void
**>
(
&
ptr_
),
size
));
}
~
CudaHostMem
()
{
OF_CUDA_CHECK
(
hipHostFree
(
ptr_
));
}
void
*
Ptr
()
const
{
return
ptr_
;
}
private:
void
*
ptr_
;
};
}
// namespace
template
<
typename
SizeType
>
class
SyncDynamicResizeGPUKernel
final
:
public
Kernel
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
SyncDynamicResizeGPUKernel
);
SyncDynamicResizeGPUKernel
()
=
default
;
~
SyncDynamicResizeGPUKernel
()
override
=
default
;
private:
bool
IsKernelLaunchSynchronized
()
const
override
{
return
false
;
}
void
ForwardDataContent
(
KernelContext
*
ctx
)
const
override
{
const
SyncDynamicResizeOpConf
&
conf
=
this
->
op_conf
().
sync_dynamic_resize_conf
();
CHECK_EQ
(
conf
.
axis
(),
0
);
std
::
shared_ptr
<
CudaHostMem
>
cuda_host_mem_ptr
;
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
if
(
queue_
.
empty
())
{
cuda_host_mem_ptr
.
reset
(
new
CudaHostMem
(
sizeof
(
SizeType
)));
}
else
{
cuda_host_mem_ptr
=
queue_
.
front
();
queue_
.
pop
();
}
}
const
Blob
*
in
=
ctx
->
BnInOp2Blob
(
"in"
);
const
Blob
*
size
=
ctx
->
BnInOp2Blob
(
"size"
);
Blob
*
out
=
ctx
->
BnInOp2Blob
(
"out"
);
AutoMemcpy
(
ctx
->
stream
(),
out
->
mut_dptr
(),
in
->
dptr
(),
in
->
ByteSizeOfBlobBody
(),
out
->
mem_case
(),
in
->
mem_case
());
AutoMemcpy
(
ctx
->
stream
(),
cuda_host_mem_ptr
->
Ptr
(),
size
->
dptr
(),
sizeof
(
SizeType
),
MakeHostMemCase
(),
size
->
mem_case
());
const
auto
&
UpdateShape
=
[
out
,
cuda_host_mem_ptr
,
conf
,
this
]()
{
const
int64_t
new_size
=
*
reinterpret_cast
<
SizeType
*>
(
cuda_host_mem_ptr
->
Ptr
());
CHECK_GE
(
new_size
,
0
);
CHECK_LE
(
new_size
,
out
->
shape_view
().
At
(
conf
.
axis
()));
// NOTE(Liang Depeng): `mut_shape_view` should be used here to get the blob's `MutShapeView`
// pointer. But this callback is called after `Kernel::Forward` function's
// execution and the header check is already been set to false at that
// moment. So we have to choose the `ForceMutShapeView` function with
// header checker disabled.
out
->
ForceMutShapeView
()
->
Set
(
conf
.
axis
(),
new_size
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
queue_
.
push
(
cuda_host_mem_ptr
);
};
if
(
conf
.
eager
())
{
CHECK_JUST
(
ctx
->
stream
()
->
Sync
());
UpdateShape
();
}
else
{
auto
*
actor_context_provider
=
CHECK_NOTNULL
(
dynamic_cast
<
ActorContextProvider
*>
(
ctx
));
actor_context_provider
->
GetActorContext
()
->
AddCallback
(
UpdateShape
);
}
}
mutable
std
::
queue
<
std
::
shared_ptr
<
CudaHostMem
>>
queue_
;
mutable
std
::
mutex
mutex_
;
};
#define REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(stype) \
NEW_REGISTER_KERNEL(OperatorConf::kSyncDynamicResizeConf, SyncDynamicResizeGPUKernel<stype>) \
.SetIsMatchedPred([](const KernelConf& kernel_conf) { \
return (kernel_conf.op_attribute().op_conf().device_tag() == "cuda" \
&& GetDataType<stype>::value \
== kernel_conf.sync_dynamic_resize_conf().size_data_type()); \
})
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL
(
int8_t
);
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL
(
int32_t
);
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL
(
int64_t
);
#endif // WITH_ROCM
template
<
typename
SizeType
>
class
SyncDynamicResizeCPUKernel
final
:
public
Kernel
{
public:
...
...
oneflow/core/kernel/user_kernel.h
View file @
460b49f4
...
...
@@ -29,6 +29,12 @@ limitations under the License.
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#endif // WITH_ROCM
namespace
oneflow
{
class
UserKernelComputeContext
;
...
...
oneflow/core/lazy/actor/light_actor.cpp
View file @
460b49f4
...
...
@@ -35,6 +35,12 @@ limitations under the License.
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#endif // WITH_ROCM
namespace
oneflow
{
namespace
{
...
...
oneflow/core/lazy/actor/naive_actor.cpp
View file @
460b49f4
...
...
@@ -39,5 +39,8 @@ REGISTER_ACTOR(TaskType::kCriticalSectionWaitTick, NaiveActor);
#ifdef WITH_CUDA
REGISTER_ACTOR
(
TaskType
::
kCopyHd
,
NaiveActor
);
#endif
#ifdef WITH_ROCM
REGISTER_ACTOR
(
TaskType
::
kCopyHd
,
NaiveActor
);
#endif
REGISTER_ACTOR
(
TaskType
::
kCollectiveBoxingGeneric
,
NaiveActor
);
}
// namespace oneflow
oneflow/core/vm/bin_allocator_test.cpp
View file @
460b49f4
...
...
@@ -129,3 +129,122 @@ TEST(CudaBinAllocator, cuda_allocator) {
}
// namespace oneflow
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include "gtest/gtest.h"
#include "oneflow/core/vm/bin_allocator.h"
#include "oneflow/core/vm/thread_safe_allocator.h"
#include "oneflow/core/device/cuda_util.h"
namespace
oneflow
{
namespace
vm
{
class
CudaBackendAllocator
final
:
public
Allocator
{
public:
explicit
CudaBackendAllocator
(
int64_t
device_id
)
:
device_id_
(
device_id
)
{}
~
CudaBackendAllocator
()
override
=
default
;
Maybe
<
void
>
Allocate
(
char
**
mem_ptr
,
std
::
size_t
size
)
override
;
void
Deallocate
(
char
*
mem_ptr
,
std
::
size_t
size
)
override
;
void
DeviceReset
()
override
;
private:
int64_t
device_id_
;
};
Maybe
<
void
>
CudaBackendAllocator
::
Allocate
(
char
**
mem_ptr
,
std
::
size_t
size
)
{
hipSetDevice
(
device_id_
);
if
(
hipMalloc
(
mem_ptr
,
size
)
!=
hipSuccess
)
{
*
mem_ptr
=
nullptr
;
}
return
Maybe
<
void
>::
Ok
();
}
void
CudaBackendAllocator
::
Deallocate
(
char
*
mem_ptr
,
std
::
size_t
size
)
{
hipSetDevice
(
device_id_
);
OF_CUDA_CHECK
(
hipFree
(
mem_ptr
));
}
void
CudaBackendAllocator
::
DeviceReset
()
{
hipSetDevice
(
device_id_
);
// NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
// So there need release all cuda memory allocated by this process before core dump.
LOG
(
WARNING
)
<<
"OOM error is detected, process will exit. And it will start to reset CUDA "
<<
"device for releasing device memory."
;
OF_CUDA_CHECK
(
hipDeviceReset
());
}
TEST
(
CudaBinAllocator
,
cuda_allocator
)
{
int
gpu_num
=
-
1
;
hipGetDeviceCount
(
&
gpu_num
);
if
(
gpu_num
<=
0
)
{
LOG
(
INFO
)
<<
"CudaBinAllocator Test: Skip because of non GPU device."
;
return
;
}
ASSERT_TRUE
(
hipSuccess
==
hipSetDevice
(
0
));
size_t
free_bytes
=
-
1
;
size_t
total_bytes
=
-
1
;
const
size_t
remain_bytes
=
50
*
1048576
;
ASSERT_TRUE
(
hipSuccess
==
hipMemGetInfo
(
&
free_bytes
,
&
total_bytes
));
if
(
free_bytes
<=
remain_bytes
||
free_bytes
-
remain_bytes
<
remain_bytes
)
{
LOG
(
INFO
)
<<
"CudaBinAllocator Test: Skip because of allocator mem bytes less than 50MiB in GPU 0"
;
return
;
}
std
::
unique_ptr
<
Allocator
>
allo
(
new
BinAllocator
(
kCudaMemAllocAlignSize
,
std
::
make_unique
<
CudaBackendAllocator
>
(
0
)));
allo
.
reset
(
new
SingleThreadOnlyAllocator
(
std
::
move
(
allo
)));
Allocator
*
a
=
allo
.
get
();
std
::
vector
<
char
*>
ptrs
;
for
(
int
i
=
0
;
i
<
512
;
++
i
)
{
char
*
ptr
=
nullptr
;
CHECK_JUST
(
a
->
Allocate
(
&
ptr
,
1
));
ASSERT_TRUE
(
ptr
!=
nullptr
);
ptrs
.
emplace_back
(
ptr
);
}
std
::
sort
(
ptrs
.
begin
(),
ptrs
.
end
());
for
(
int
i
=
0
;
i
<
512
;
++
i
)
{
if
(
i
>
0
)
{
ASSERT_TRUE
(
ptrs
.
at
(
i
)
!=
ptrs
.
at
(
i
-
1
));
ASSERT_TRUE
(
std
::
abs
(
ptrs
.
at
(
i
)
-
ptrs
.
at
(
i
-
1
))
>=
kCudaMemAllocAlignSize
);
}
a
->
Deallocate
(
ptrs
.
at
(
i
),
1
);
}
ptrs
.
clear
();
for
(
int
i
=
0
;
i
<
2048
;
++
i
)
{
char
*
ptr
=
nullptr
;
CHECK_JUST
(
a
->
Allocate
(
&
ptr
,
10000
));
ASSERT_TRUE
(
ptr
!=
nullptr
);
ptrs
.
emplace_back
(
ptr
);
}
std
::
sort
(
ptrs
.
begin
(),
ptrs
.
end
());
for
(
int
i
=
0
;
i
<
2048
;
++
i
)
{
if
(
i
>
0
)
{
ASSERT_TRUE
(
ptrs
.
at
(
i
)
!=
ptrs
.
at
(
i
-
1
));
ASSERT_TRUE
(
std
::
abs
(
ptrs
.
at
(
i
)
-
ptrs
.
at
(
i
-
1
))
>=
kCudaMemAllocAlignSize
);
}
a
->
Deallocate
(
ptrs
.
at
(
i
),
10000
);
}
char
*
data_ptr_1
=
nullptr
;
CHECK_JUST
(
a
->
Allocate
(
&
data_ptr_1
,
2048
*
sizeof
(
float
)));
char
*
data_ptr_2
=
nullptr
;
CHECK_JUST
(
a
->
Allocate
(
&
data_ptr_2
,
4096
*
sizeof
(
double
)));
ASSERT_TRUE
(
data_ptr_1
!=
data_ptr_2
);
if
(
data_ptr_1
<
data_ptr_2
)
{
ASSERT_TRUE
(
data_ptr_1
+
2048
*
sizeof
(
float
)
<=
data_ptr_2
);
}
else
{
ASSERT_TRUE
(
data_ptr_2
+
4096
*
sizeof
(
double
)
<=
data_ptr_1
);
}
a
->
Deallocate
(
data_ptr_2
,
4096
*
sizeof
(
double
));
a
->
Deallocate
(
data_ptr_1
,
2048
*
sizeof
(
float
));
}
}
// namespace vm
}
// namespace oneflow
#endif // WITH_ROCM
oneflow/core/vm/cuda_backend_allocator.cpp
View file @
460b49f4
...
...
@@ -50,3 +50,41 @@ void CudaBackendAllocator::DeviceReset() {
}
// namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/vm/cuda_backend_allocator.h"
#include "oneflow/core/device/cuda_util.h"
#include <iostream>
namespace
oneflow
{
namespace
vm
{
Maybe
<
void
>
CudaBackendAllocator
::
Allocate
(
char
**
mem_ptr
,
std
::
size_t
size
)
{
hipSetDevice
(
device_id_
);
if
(
hipMalloc
(
mem_ptr
,
size
)
!=
hipSuccess
)
{
*
mem_ptr
=
nullptr
;
return
Error
::
OutOfMemoryError
()
<<
"cuda allocator out of memory"
;
}
return
Maybe
<
void
>::
Ok
();
}
void
CudaBackendAllocator
::
Deallocate
(
char
*
mem_ptr
,
std
::
size_t
size
)
{
hipSetDevice
(
device_id_
);
OF_CUDA_CHECK
(
hipFree
(
mem_ptr
));
}
void
CudaBackendAllocator
::
DeviceReset
()
{
hipSetDevice
(
device_id_
);
// NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
// So there need release all cuda memory allocated by this process before core dump.
LOG
(
WARNING
)
<<
"OOM error is detected, process will exit. And it will start to reset CUDA "
<<
"device for releasing device memory."
;
OF_CUDA_CHECK
(
hipDeviceReset
());
}
}
// namespace vm
}
// namespace oneflow
#endif
oneflow/core/vm/cuda_host_allocator.cpp
View file @
460b49f4
...
...
@@ -63,3 +63,54 @@ COMMAND(Singleton<CudaHostAllocator>::SetAllocated(new CudaHostAllocator(0)));
}
// namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/vm/cuda_host_allocator.h"
#include "oneflow/core/device/cuda_util.h"
namespace
oneflow
{
namespace
vm
{
CudaHostAllocator
::~
CudaHostAllocator
()
{
CudaCurrentDeviceGuard
guard
(
device_id_
);
for
(
const
auto
&
ptr_vec
:
granularity2free_ptrs_
)
{
for
(
char
*
ptr
:
ptr_vec
)
{
OF_CUDA_CHECK
(
hipHostFree
(
ptr
));
}
}
for
(
const
auto
&
pair
:
occupied_ptr2granularity_
)
{
OF_CUDA_CHECK
(
hipHostFree
(
pair
.
first
));
}
}
Maybe
<
void
>
CudaHostAllocator
::
Allocate
(
char
**
mem_ptr
,
std
::
size_t
size
)
{
std
::
size_t
granularity
=
std
::
ceil
(
std
::
log2
(
size
));
CHECK_GE_OR_RETURN
(
granularity
,
0
)
<<
"out of range"
;
CHECK_LT_OR_RETURN
(
granularity
,
kCudaHostMaxGranularity
)
<<
"invalid granularity"
;
CHECK_LE_OR_RETURN
(
size
,
1
<<
granularity
)
<<
"out of range"
;
CudaCurrentDeviceGuard
guard
(
device_id_
);
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
auto
*
vec
=
&
granularity2free_ptrs_
[
granularity
];
if
(
vec
->
empty
())
{
char
*
ptr
=
nullptr
;
OF_CUDA_CHECK
(
hipMallocHost
(
reinterpret_cast
<
void
**>
(
&
ptr
),
1
<<
granularity
));
vec
->
emplace_back
(
ptr
);
}
*
mem_ptr
=
vec
->
back
();
vec
->
pop_back
();
occupied_ptr2granularity_
[
*
mem_ptr
]
=
granularity
;
return
Maybe
<
void
>::
Ok
();
}
void
CudaHostAllocator
::
Deallocate
(
char
*
mem_ptr
,
std
::
size_t
size
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
auto
iter
=
occupied_ptr2granularity_
.
find
(
mem_ptr
);
CHECK
(
iter
!=
occupied_ptr2granularity_
.
end
());
std
::
size_t
granularity
=
iter
->
second
;
occupied_ptr2granularity_
.
erase
(
iter
);
granularity2free_ptrs_
[
granularity
].
emplace_back
(
mem_ptr
);
}
COMMAND
(
Singleton
<
CudaHostAllocator
>::
SetAllocated
(
new
CudaHostAllocator
(
0
)));
}
// namespace vm
}
// namespace oneflow
#endif
oneflow/core/vm/ep_backend_allocator.cpp
View file @
460b49f4
...
...
@@ -39,6 +39,16 @@ void EpBackendAllocator::DeviceReset() {
OF_CUDA_CHECK
(
cudaDeviceReset
());
}
#endif
#ifdef WITH_ROCM
if
(
ep_device_
->
device_type
()
==
DeviceType
::
kCUDA
)
{
ep_device_
->
SetAsActiveDevice
();
// NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
// So there need release all cuda memory allocated by this process before core dump.
LOG
(
WARNING
)
<<
"OOM error is detected, process will exit. And it will start to reset CUDA "
<<
"device for releasing device memory."
;
OF_CUDA_CHECK
(
hipDeviceReset
());
}
#endif
}
}
// namespace vm
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment