OpenDAS / Oneflow
Commit 460b49f4, authored Nov 02, 2022 by yuguo
trivial files migration
Parent: 5e07668d
Showing 18 changed files, with 382 additions and 1 deletion.
oneflow/api/cpp/tests/graph_test.cpp (+20 -0)
oneflow/api/cpp/tests/tensor_test.cpp (+9 -0)
oneflow/api/python/flags.cpp (+4 -1)
oneflow/core/ep/test/primitive/fill_test.cpp (+8 -0)
oneflow/core/framework/multi_client_session_context.cpp (+3 -0)
oneflow/core/functional/impl/array_functor.cpp (+3 -0)
oneflow/core/graph_impl/decode_h2d_compute_task_node.cpp (+6 -0)
oneflow/core/job/runtime.cpp (+3 -0)
oneflow/core/kernel/broadcast_to_compatible_with_kernel.cpp (+3 -0)
oneflow/core/kernel/foreign_watch_kernel.cpp (+4 -0)
oneflow/core/kernel/sync_dynamic_resize_kernel.cpp (+86 -0)
oneflow/core/kernel/user_kernel.h (+6 -0)
oneflow/core/lazy/actor/light_actor.cpp (+6 -0)
oneflow/core/lazy/actor/naive_actor.cpp (+3 -0)
oneflow/core/vm/bin_allocator_test.cpp (+119 -0)
oneflow/core/vm/cuda_backend_allocator.cpp (+38 -0)
oneflow/core/vm/cuda_host_allocator.cpp (+51 -0)
oneflow/core/vm/ep_backend_allocator.cpp (+10 -0)
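Every diff below follows the same pattern: the existing WITH_CUDA block is duplicated under a WITH_ROCM guard, CUDA runtime calls are replaced by their HIP equivalents (hipMalloc, hipFree, hipSetDevice, hipMallocHost, hipHostFree, hipDeviceReset), and the "cuda" device tag plus the DeviceType::kCUDA identifiers are kept so the surrounding code stays untouched. A minimal sketch of that guard shape follows; DoDeviceWork is a hypothetical placeholder for illustration, not code from this commit:

// Sketch only. Assumes this fork's cuda_util.h provides OF_CUDA_CHECK for both
// runtimes, as the WITH_ROCM blocks in this commit rely on, and that WITH_CUDA
// and WITH_ROCM are never defined in the same build.
#include "oneflow/core/device/cuda_util.h"

#ifdef WITH_CUDA
#include <cuda_runtime.h>
void DoDeviceWork(int device_id) {
  OF_CUDA_CHECK(cudaSetDevice(device_id));  // CUDA build: CUDA runtime call
}
#endif

#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
void DoDeviceWork(int device_id) {
  OF_CUDA_CHECK(hipSetDevice(device_id));   // ROCm build: same logic, HIP runtime call
}
#endif

The duplicated TEST bodies and kernel registrations below rely on the same mutual exclusivity of the two guards.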
oneflow/api/cpp/tests/graph_test.cpp

@@ -84,6 +84,26 @@ TEST(Api, graph_multi_gpu_test) {
}
#endif

#ifdef WITH_ROCM
TEST(Api, graph_gpu_test) {
  EnvScope scope;
  Device device("cuda", 0);
  Graph graph = LoadGraph(device);
  Forward(graph, device);
}

TEST(Api, graph_multi_gpu_test) {
  EnvScope scope;
  Device device("cuda", 0);
  Graph graph = LoadGraph(device);
  Forward(graph, device);

  Device device1("cuda", 1);
  Graph graph1 = LoadGraph(device1);
  Forward(graph1, device1);
}
#endif

TEST(Api, graph_cpu_batching_test) {
  EnvScope scope;
  Device device("cpu");
...
oneflow/api/cpp/tests/tensor_test.cpp

@@ -34,6 +34,15 @@ TEST(Api, device) {
  ASSERT_EQ(device.type(), "cuda");
  ASSERT_EQ(device.device_id(), 1);
#endif

#ifdef WITH_ROCM
  device = Device("cuda:0");
  ASSERT_EQ(device.type(), "cuda");
  ASSERT_EQ(device.device_id(), 0);
  device = Device("cuda", 1);
  ASSERT_EQ(device.type(), "cuda");
  ASSERT_EQ(device.device_id(), 1);
#endif
}

TEST(Api, tensor) {
...
oneflow/api/python/flags.cpp

@@ -17,12 +17,15 @@ limitations under the License.
#ifdef WITH_CUDA
#include <cuda.h>
#endif

#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#endif

namespace oneflow {

ONEFLOW_API_PYBIND11_MODULE("flags", m) {
  m.def("with_cuda", []() {
-#ifdef WITH_CUDA
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
    return true;
#else
    return false;
...
oneflow/core/ep/test/primitive/fill_test.cpp

@@ -27,6 +27,11 @@ limitations under the License.
#endif // CUDA_VERSION >= 11000
#endif // WITH_CUDA

#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#endif // WITH_ROCM

namespace oneflow {

namespace ep {
...

@@ -87,6 +92,9 @@ TEST_F(PrimitiveTest, TestFill) {
                                      1024);
#endif // CUDA_VERSION >= 11000
#endif // WITH_CUDA
  // #ifdef WITH_ROCM
  // TestFill<DataType::kFloat16, half>(&device_manager_registry_, available_device_types_, 1024);
  // #endif // WITH_ROCM
  TestFill<DataType::kBool, bool>(&device_manager_registry_, available_device_types_, 1024);
}
...
oneflow/core/framework/multi_client_session_context.cpp

@@ -41,6 +41,9 @@ limitations under the License.
#ifdef WITH_CUDA
#include <cuda.h>
#endif // WITH_CUDA

#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#endif // WITH_ROCM

namespace oneflow {
...
oneflow/core/functional/impl/array_functor.cpp

@@ -1389,6 +1389,9 @@ class CopyFunctor {
#ifdef WITH_CUDA
    if (device_type == "cuda") { InitCudaContextOnce(device_id); }
#endif
#ifdef WITH_ROCM
    if (device_type == "cuda") { InitCudaContextOnce(device_id); }
#endif
    return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
  }
...
oneflow/core/graph_impl/decode_h2d_compute_task_node.cpp

@@ -62,6 +62,12 @@ REGISTER_NAMED_TASK_STREAM_INDEX_GETTER(DeviceType::kCUDA, TaskType::kDecodeH2D,
#endif

#ifdef WITH_ROCM
REGISTER_NAMED_TASK_STREAM_INDEX_GETTER(DeviceType::kCUDA, TaskType::kDecodeH2D, "DECODE_H2D")
#endif

namespace {

CompTaskNode* CreateCompTaskNodeByOpDeviceType(const OperatorConf& op_conf) {
...
oneflow/core/job/runtime.cpp

@@ -73,6 +73,9 @@ Runtime::Runtime(
#ifdef WITH_CUDA
    Singleton<EagerNcclCommMgr>::Get()->CreateCommFromPlan(plan);
#endif // WITH_CUDA
#ifdef WITH_ROCM
    Singleton<EagerNcclCommMgr>::Get()->CreateCommFromPlan(plan);
#endif // WITH_ROCM
  }
  std::vector<const TaskProto*> source_tasks;
  source_tasks.reserve(plan.task().size());
...
oneflow/core/kernel/broadcast_to_compatible_with_kernel.cpp

@@ -59,5 +59,8 @@ OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL,
#if defined(WITH_CUDA)
REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL(DeviceType::kCUDA, (float16, DataType::kFloat16))
#endif
#if defined(WITH_ROCM)
REGISTTER_BROADCAST_TO_COMPATIBLE_WITH_KERNEL(DeviceType::kCUDA, (float16, DataType::kFloat16))
#endif

}  // namespace oneflow
oneflow/core/kernel/foreign_watch_kernel.cpp

@@ -48,5 +48,9 @@ REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCPU,
REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCUDA,
                            ForeignWatchKernel<DeviceType::kCUDA>);
#endif
#ifdef WITH_ROCM
REGISTER_KERNEL_WITH_DEVICE(OperatorConf::kForeignWatchConf, DeviceType::kCUDA,
                            ForeignWatchKernel<DeviceType::kCUDA>);
#endif

}  // namespace oneflow
oneflow/core/kernel/sync_dynamic_resize_kernel.cpp

@@ -114,6 +114,92 @@ REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t);
#endif // WITH_CUDA

#ifdef WITH_ROCM

namespace {

class CudaHostMem {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CudaHostMem);
  CudaHostMem(const size_t size) {
    OF_CUDA_CHECK(hipMallocHost(reinterpret_cast<void**>(&ptr_), size));
  }
  ~CudaHostMem() { OF_CUDA_CHECK(hipHostFree(ptr_)); }

  void* Ptr() const { return ptr_; }

 private:
  void* ptr_;
};

}  // namespace

template<typename SizeType>
class SyncDynamicResizeGPUKernel final : public Kernel {
 public:
  OF_DISALLOW_COPY_AND_MOVE(SyncDynamicResizeGPUKernel);
  SyncDynamicResizeGPUKernel() = default;
  ~SyncDynamicResizeGPUKernel() override = default;

 private:
  bool IsKernelLaunchSynchronized() const override { return false; }

  void ForwardDataContent(KernelContext* ctx) const override {
    const SyncDynamicResizeOpConf& conf = this->op_conf().sync_dynamic_resize_conf();
    CHECK_EQ(conf.axis(), 0);
    std::shared_ptr<CudaHostMem> cuda_host_mem_ptr;
    {
      std::lock_guard<std::mutex> lock(mutex_);
      if (queue_.empty()) {
        cuda_host_mem_ptr.reset(new CudaHostMem(sizeof(SizeType)));
      } else {
        cuda_host_mem_ptr = queue_.front();
        queue_.pop();
      }
    }
    const Blob* in = ctx->BnInOp2Blob("in");
    const Blob* size = ctx->BnInOp2Blob("size");
    Blob* out = ctx->BnInOp2Blob("out");
    AutoMemcpy(ctx->stream(), out->mut_dptr(), in->dptr(), in->ByteSizeOfBlobBody(),
               out->mem_case(), in->mem_case());
    AutoMemcpy(ctx->stream(), cuda_host_mem_ptr->Ptr(), size->dptr(), sizeof(SizeType),
               MakeHostMemCase(), size->mem_case());
    const auto& UpdateShape = [out, cuda_host_mem_ptr, conf, this]() {
      const int64_t new_size = *reinterpret_cast<SizeType*>(cuda_host_mem_ptr->Ptr());
      CHECK_GE(new_size, 0);
      CHECK_LE(new_size, out->shape_view().At(conf.axis()));
      // NOTE(Liang Depeng): `mut_shape_view` should be used here to get the blob's `MutShapeView`
      //                     pointer. But this callback is called after the `Kernel::Forward`
      //                     function's execution, and the header check has already been set to
      //                     false at that moment. So we have to choose the `ForceMutShapeView`
      //                     function with the header check disabled.
      out->ForceMutShapeView()->Set(conf.axis(), new_size);
      std::lock_guard<std::mutex> lock(mutex_);
      queue_.push(cuda_host_mem_ptr);
    };
    if (conf.eager()) {
      CHECK_JUST(ctx->stream()->Sync());
      UpdateShape();
    } else {
      auto* actor_context_provider = CHECK_NOTNULL(dynamic_cast<ActorContextProvider*>(ctx));
      actor_context_provider->GetActorContext()->AddCallback(UpdateShape);
    }
  }

  mutable std::queue<std::shared_ptr<CudaHostMem>> queue_;
  mutable std::mutex mutex_;
};

#define REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(stype)                                         \
  NEW_REGISTER_KERNEL(OperatorConf::kSyncDynamicResizeConf, SyncDynamicResizeGPUKernel<stype>) \
      .SetIsMatchedPred([](const KernelConf& kernel_conf) {                                    \
        return (kernel_conf.op_attribute().op_conf().device_tag() == "cuda"                    \
                && GetDataType<stype>::value                                                   \
                       == kernel_conf.sync_dynamic_resize_conf().size_data_type());            \
      })

REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int8_t);
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int32_t);
REGISTER_SYNC_DYNAMIC_RESIZE_GPU_KERNEL(int64_t);

#endif // WITH_ROCM

template<typename SizeType>
class SyncDynamicResizeCPUKernel final : public Kernel {
 public:
...
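The ROCm kernel above avoids allocating fresh pinned host memory on every launch: it keeps previously used CudaHostMem buffers in a mutex-guarded queue and pushes each buffer back in the shape-update callback once the device-to-host copy of the size value has completed. A distilled sketch of that reuse pattern follows; PinnedBuffer and PinnedBufferPool are illustrative names, not part of this commit, and the sketch assumes every request has the same size (sizeof(SizeType) in the kernel above).

// Hedged sketch of the pinned-buffer pooling idea used by SyncDynamicResizeGPUKernel.
#include <cstddef>
#include <memory>
#include <mutex>
#include <queue>

#include <hip/hip_runtime.h>

class PinnedBuffer {
 public:
  explicit PinnedBuffer(std::size_t size) {
    // hipMallocHost returns page-locked host memory, so async device-to-host
    // copies into it can complete on the stream without staging.
    (void)hipMallocHost(&ptr_, size);
  }
  ~PinnedBuffer() { (void)hipHostFree(ptr_); }
  void* Ptr() const { return ptr_; }

 private:
  void* ptr_ = nullptr;
};

class PinnedBufferPool {
 public:
  // Assumes all requests share one size, as in the kernel above.
  std::shared_ptr<PinnedBuffer> Acquire(std::size_t size) {
    std::lock_guard<std::mutex> lock(mutex_);
    if (free_.empty()) { return std::make_shared<PinnedBuffer>(size); }
    auto buf = free_.front();
    free_.pop();
    return buf;
  }
  // Called from the stream callback once the copy into the buffer is done.
  void Release(std::shared_ptr<PinnedBuffer> buf) {
    std::lock_guard<std::mutex> lock(mutex_);
    free_.push(std::move(buf));
  }

 private:
  std::queue<std::shared_ptr<PinnedBuffer>> free_;
  std::mutex mutex_;
};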
oneflow/core/kernel/user_kernel.h

@@ -29,6 +29,12 @@ limitations under the License.
#endif // WITH_CUDA

#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#endif // WITH_ROCM

namespace oneflow {

class UserKernelComputeContext;
...
oneflow/core/lazy/actor/light_actor.cpp

@@ -35,6 +35,12 @@ limitations under the License.
#endif // WITH_CUDA

#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#endif // WITH_ROCM

namespace oneflow {

namespace {
...
oneflow/core/lazy/actor/naive_actor.cpp

@@ -39,5 +39,8 @@ REGISTER_ACTOR(TaskType::kCriticalSectionWaitTick, NaiveActor);
#ifdef WITH_CUDA
REGISTER_ACTOR(TaskType::kCopyHd, NaiveActor);
#endif
#ifdef WITH_ROCM
REGISTER_ACTOR(TaskType::kCopyHd, NaiveActor);
#endif
REGISTER_ACTOR(TaskType::kCollectiveBoxingGeneric, NaiveActor);

}  // namespace oneflow
oneflow/core/vm/bin_allocator_test.cpp

@@ -129,3 +129,122 @@ TEST(CudaBinAllocator, cuda_allocator) {
}  // namespace oneflow

#endif // WITH_CUDA

#ifdef WITH_ROCM

#include "gtest/gtest.h"
#include "oneflow/core/vm/bin_allocator.h"
#include "oneflow/core/vm/thread_safe_allocator.h"
#include "oneflow/core/device/cuda_util.h"

namespace oneflow {
namespace vm {

class CudaBackendAllocator final : public Allocator {
 public:
  explicit CudaBackendAllocator(int64_t device_id) : device_id_(device_id) {}
  ~CudaBackendAllocator() override = default;

  Maybe<void> Allocate(char** mem_ptr, std::size_t size) override;
  void Deallocate(char* mem_ptr, std::size_t size) override;
  void DeviceReset() override;

 private:
  int64_t device_id_;
};

Maybe<void> CudaBackendAllocator::Allocate(char** mem_ptr, std::size_t size) {
  hipSetDevice(device_id_);
  if (hipMalloc(mem_ptr, size) != hipSuccess) { *mem_ptr = nullptr; }
  return Maybe<void>::Ok();
}

void CudaBackendAllocator::Deallocate(char* mem_ptr, std::size_t size) {
  hipSetDevice(device_id_);
  OF_CUDA_CHECK(hipFree(mem_ptr));
}

void CudaBackendAllocator::DeviceReset() {
  hipSetDevice(device_id_);
  // NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
  //   So there need release all cuda memory allocated by this process before core dump.
  LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
               << "device for releasing device memory.";
  OF_CUDA_CHECK(hipDeviceReset());
}

TEST(CudaBinAllocator, cuda_allocator) {
  int gpu_num = -1;
  hipGetDeviceCount(&gpu_num);
  if (gpu_num <= 0) {
    LOG(INFO) << "CudaBinAllocator Test: Skip because of non GPU device.";
    return;
  }
  ASSERT_TRUE(hipSuccess == hipSetDevice(0));

  size_t free_bytes = -1;
  size_t total_bytes = -1;
  const size_t remain_bytes = 50 * 1048576;
  ASSERT_TRUE(hipSuccess == hipMemGetInfo(&free_bytes, &total_bytes));
  if (free_bytes <= remain_bytes || free_bytes - remain_bytes < remain_bytes) {
    LOG(INFO) << "CudaBinAllocator Test: Skip because of allocator mem bytes less than 50MiB in GPU 0";
    return;
  }

  std::unique_ptr<Allocator> allo(
      new BinAllocator(kCudaMemAllocAlignSize, std::make_unique<CudaBackendAllocator>(0)));
  allo.reset(new SingleThreadOnlyAllocator(std::move(allo)));
  Allocator* a = allo.get();

  std::vector<char*> ptrs;
  for (int i = 0; i < 512; ++i) {
    char* ptr = nullptr;
    CHECK_JUST(a->Allocate(&ptr, 1));
    ASSERT_TRUE(ptr != nullptr);
    ptrs.emplace_back(ptr);
  }
  std::sort(ptrs.begin(), ptrs.end());
  for (int i = 0; i < 512; ++i) {
    if (i > 0) {
      ASSERT_TRUE(ptrs.at(i) != ptrs.at(i - 1));
      ASSERT_TRUE(std::abs(ptrs.at(i) - ptrs.at(i - 1)) >= kCudaMemAllocAlignSize);
    }
    a->Deallocate(ptrs.at(i), 1);
  }

  ptrs.clear();
  for (int i = 0; i < 2048; ++i) {
    char* ptr = nullptr;
    CHECK_JUST(a->Allocate(&ptr, 10000));
    ASSERT_TRUE(ptr != nullptr);
    ptrs.emplace_back(ptr);
  }
  std::sort(ptrs.begin(), ptrs.end());
  for (int i = 0; i < 2048; ++i) {
    if (i > 0) {
      ASSERT_TRUE(ptrs.at(i) != ptrs.at(i - 1));
      ASSERT_TRUE(std::abs(ptrs.at(i) - ptrs.at(i - 1)) >= kCudaMemAllocAlignSize);
    }
    a->Deallocate(ptrs.at(i), 10000);
  }

  char* data_ptr_1 = nullptr;
  CHECK_JUST(a->Allocate(&data_ptr_1, 2048 * sizeof(float)));
  char* data_ptr_2 = nullptr;
  CHECK_JUST(a->Allocate(&data_ptr_2, 4096 * sizeof(double)));
  ASSERT_TRUE(data_ptr_1 != data_ptr_2);
  if (data_ptr_1 < data_ptr_2) {
    ASSERT_TRUE(data_ptr_1 + 2048 * sizeof(float) <= data_ptr_2);
  } else {
    ASSERT_TRUE(data_ptr_2 + 4096 * sizeof(double) <= data_ptr_1);
  }
  a->Deallocate(data_ptr_2, 4096 * sizeof(double));
  a->Deallocate(data_ptr_1, 2048 * sizeof(float));
}

}  // namespace vm
}  // namespace oneflow

#endif // WITH_ROCM
oneflow/core/vm/cuda_backend_allocator.cpp

@@ -50,3 +50,41 @@ void CudaBackendAllocator::DeviceReset() {
}  // namespace oneflow

#endif

#ifdef WITH_ROCM

#include "oneflow/core/vm/cuda_backend_allocator.h"
#include "oneflow/core/device/cuda_util.h"

#include <iostream>

namespace oneflow {
namespace vm {

Maybe<void> CudaBackendAllocator::Allocate(char** mem_ptr, std::size_t size) {
  hipSetDevice(device_id_);
  if (hipMalloc(mem_ptr, size) != hipSuccess) {
    *mem_ptr = nullptr;
    return Error::OutOfMemoryError() << "cuda allocator out of memory";
  }
  return Maybe<void>::Ok();
}

void CudaBackendAllocator::Deallocate(char* mem_ptr, std::size_t size) {
  hipSetDevice(device_id_);
  OF_CUDA_CHECK(hipFree(mem_ptr));
}

void CudaBackendAllocator::DeviceReset() {
  hipSetDevice(device_id_);
  // NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
  //   So there need release all cuda memory allocated by this process before core dump.
  LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
               << "device for releasing device memory.";
  OF_CUDA_CHECK(hipDeviceReset());
}

}  // namespace vm
}  // namespace oneflow

#endif
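The ROCm Allocate above reports a failed hipMalloc through OneFlow's Maybe<void> by returning Error::OutOfMemoryError() instead of aborting, so callers can back off or trigger DeviceReset. A hedged caller-side sketch follows; TryHugeDeviceAllocation and the request size are illustrative, and Maybe<void>::IsOk() from oneflow/core/common/maybe.h is an assumption, not something this diff shows.

// Hedged sketch only; not code from this commit.
#include <cstddef>

#include "oneflow/core/common/maybe.h"                 // assumption: Maybe<void>::IsOk()
#include "oneflow/core/vm/cuda_backend_allocator.h"

bool TryHugeDeviceAllocation(oneflow::vm::CudaBackendAllocator* allocator) {
  char* ptr = nullptr;
  // A request far beyond device memory should surface Error::OutOfMemoryError()
  // from the ROCm path above rather than crash; ptr stays nullptr on failure.
  auto result = allocator->Allocate(&ptr, static_cast<std::size_t>(1) << 48);
  if (!result.IsOk()) { return false; }  // out of memory
  allocator->Deallocate(ptr, static_cast<std::size_t>(1) << 48);
  return true;
}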
oneflow/core/vm/cuda_host_allocator.cpp

@@ -63,3 +63,54 @@ COMMAND(Singleton<CudaHostAllocator>::SetAllocated(new CudaHostAllocator(0)));
}  // namespace oneflow

#endif

#ifdef WITH_ROCM

#include "oneflow/core/vm/cuda_host_allocator.h"
#include "oneflow/core/device/cuda_util.h"

namespace oneflow {
namespace vm {

CudaHostAllocator::~CudaHostAllocator() {
  CudaCurrentDeviceGuard guard(device_id_);
  for (const auto& ptr_vec : granularity2free_ptrs_) {
    for (char* ptr : ptr_vec) { OF_CUDA_CHECK(hipHostFree(ptr)); }
  }
  for (const auto& pair : occupied_ptr2granularity_) {
    OF_CUDA_CHECK(hipHostFree(pair.first));
  }
}

Maybe<void> CudaHostAllocator::Allocate(char** mem_ptr, std::size_t size) {
  std::size_t granularity = std::ceil(std::log2(size));
  CHECK_GE_OR_RETURN(granularity, 0) << "out of range";
  CHECK_LT_OR_RETURN(granularity, kCudaHostMaxGranularity) << "invalid granularity";
  CHECK_LE_OR_RETURN(size, 1 << granularity) << "out of range";
  CudaCurrentDeviceGuard guard(device_id_);
  std::unique_lock<std::mutex> lock(mutex_);
  auto* vec = &granularity2free_ptrs_[granularity];
  if (vec->empty()) {
    char* ptr = nullptr;
    OF_CUDA_CHECK(hipMallocHost(reinterpret_cast<void**>(&ptr), 1 << granularity));
    vec->emplace_back(ptr);
  }
  *mem_ptr = vec->back();
  vec->pop_back();
  occupied_ptr2granularity_[*mem_ptr] = granularity;
  return Maybe<void>::Ok();
}

void CudaHostAllocator::Deallocate(char* mem_ptr, std::size_t size) {
  std::unique_lock<std::mutex> lock(mutex_);
  auto iter = occupied_ptr2granularity_.find(mem_ptr);
  CHECK(iter != occupied_ptr2granularity_.end());
  std::size_t granularity = iter->second;
  occupied_ptr2granularity_.erase(iter);
  granularity2free_ptrs_[granularity].emplace_back(mem_ptr);
}

COMMAND(Singleton<CudaHostAllocator>::SetAllocated(new CudaHostAllocator(0)));

}  // namespace vm
}  // namespace oneflow

#endif
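CudaHostAllocator above buckets pinned host allocations by power-of-two granularity: granularity = ceil(log2(size)), each bucket holds blocks of 1 << granularity bytes, and freed pointers return to the bucket's free list for reuse. A small self-contained illustration of that bucket arithmetic, with no HIP calls; GranularityOf is an illustrative name, not part of this commit.

// Standalone illustration of the power-of-two binning in CudaHostAllocator::Allocate.
#include <cassert>
#include <cmath>
#include <cstddef>

static std::size_t GranularityOf(std::size_t size) {
  // Same formula as the allocator: the smallest g with size <= (1 << g).
  return static_cast<std::size_t>(std::ceil(std::log2(size)));
}

int main() {
  assert(GranularityOf(1) == 0);       // 1 byte      -> 2^0 bucket
  assert(GranularityOf(9) == 4);       // 9 bytes     -> 16-byte bucket
  assert(GranularityOf(4096) == 12);   // exact powers of two map to themselves
  assert(GranularityOf(10000) == 14);  // 10000 bytes -> 16384-byte bucket
  // A request is served from bucket g, so up to (1 << g) - size bytes are
  // over-allocated, but the block can later be reused for any request that
  // falls into the same bucket.
  return 0;
}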
oneflow/core/vm/ep_backend_allocator.cpp

@@ -39,6 +39,16 @@ void EpBackendAllocator::DeviceReset() {
    OF_CUDA_CHECK(cudaDeviceReset());
  }
#endif

#ifdef WITH_ROCM
  if (ep_device_->device_type() == DeviceType::kCUDA) {
    ep_device_->SetAsActiveDevice();
    // NOTE(chengcheng): In some corner case on ubuntu, cuda memory not released even if OOM.
    //   So there need release all cuda memory allocated by this process before core dump.
    LOG(WARNING) << "OOM error is detected, process will exit. And it will start to reset CUDA "
                 << "device for releasing device memory.";
    OF_CUDA_CHECK(hipDeviceReset());
  }
#endif
}

}  // namespace vm
...