OpenDAS / Oneflow — Commit f262efc9

Authored Nov 21, 2022 by yuguo
Parent: 3f56062c

    Support profiler for DCU, support debug compiler

Showing 17 changed files with 2666 additions and 2387 deletions.
Files changed:

  CMakeLists.txt                                                  +2    -2
  cmake/third_party.cmake                                         +2    -0
  oneflow/core/profiler/event.cpp                                 +90   -94
  oneflow/core/profiler/event.h                                   +186  -188
  oneflow/core/profiler/event_recorder.cpp                        +2    -2
  oneflow/core/profiler/event_recorder.h                          +60   -60
  oneflow/core/profiler/kernel.cpp                                +64   -0
  oneflow/core/profiler/kineto_shim.cpp                           +1    -1
  oneflow/core/profiler/kineto_shim.h                             +1    -1
  oneflow/core/profiler/profile_manager.cpp                       +5    -6
  oneflow/core/profiler/profile_manager.h                         +1    -1
  oneflow/core/profiler/profiler.cpp                              +39   -0
  oneflow/user/kernels/math_unary_elementwise_func.h              +983  -983
  oneflow/user/kernels/nvtx_range_kernel.hip.cpp                  +138  -0
  oneflow/user/kernels/stateful_opkernel.cpp                      +901  -901
  python/oneflow/test/modules/fused_dot_feature_interaction.py    +43   -0
  python/oneflow/test/profiler/test_profile_lenet.py              +148  -148
CMakeLists.txt

@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
 use_mirror(VARIABLE ROBIN_HOOD_HASHING_URL URL ${ROBIN_HOOD_HASHING_URL})
 set(ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467)
-set(FMT_URL https://github.com/fmtlib/fmt/archive/48b7e3dafb27ece02cd6addc8bd1041c79d59c2c.zip)
+set(FMT_URL https://github.com/fmtlib/fmt/archive/fc07217d85e6dcec52878807d6bbd89a9d9156a5.zip)
 use_mirror(VARIABLE FMT_URL URL ${FMT_URL})
-set(FMT_MD5 45925a979ed7195e0c88a70be691de09)
+set(FMT_MD5 7d9bb2ececc9ede29cd35bdc42a7e22c)
 set(KINETO_URL
     https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip)
cmake/third_party.cmake

@@ -175,6 +175,8 @@ if (BUILD_ROCM)
   add_definitions(-D__HIP_PLATFORM_HCC__)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
+  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
   list(APPEND oneflow_third_party_libs hip::device)
   list(APPEND oneflow_third_party_libs roc::hipblas)
   list(APPEND oneflow_third_party_libs hip::hipcub)
oneflow/core/profiler/event.cpp

@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
-// #include "fmt/core.h"
-// #include "fmt/format.h"
+#include "fmt/core.h"
+#include "fmt/format.h"
 #include "oneflow/core/profiler/event.h"
 #include "oneflow/core/profiler/util.h"

@@ -58,14 +58,13 @@ std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, Custom
   return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
 }

-// std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
-std::string KernelEvent::Key() { return "yuguo"; }
+std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }

 nlohmann::json KernelEvent::ToJson() {
   auto j = IEvent::ToJson();
   j["type"] = EventType::kOneflowKernel;
   j["input_shapes"] = GetFormatedInputShapes();
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
   j["memory_size"] = memory_size_;
   if (!children_.empty()) { j["children"] = children_; }
 #endif  // WITH_CUDA

@@ -73,12 +72,10 @@ nlohmann::json KernelEvent::ToJson() {
 }

 std::shared_ptr<KernelEvent> KernelEvent::Create(
-    const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter) {
+    const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
   return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
 }

-void KernelEvent::RecordShape(const ShapeView& shape) { input_shapes_.emplace_back(shape); }
-
 std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
   if (input_shapes_.size() == 0) { return "-"; }
   std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));

@@ -87,8 +84,7 @@ std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
     shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
   }
   if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
-  // return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
-  return "yuguo";
+  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
 }

 }  // namespace profiler
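With the "yuguo" placeholders gone, Key() again composes "name.[shapes]" strings via {fmt}. A minimal standalone sketch of that composition, assuming only the {fmt} library (the shape strings and the "conv2d" name are illustrative stand-ins for formatted Shape values, not taken from the commit):

#include <string>
#include <vector>
#include "fmt/format.h"
#include "fmt/ranges.h"  // fmt::join lives here in recent {fmt} releases

int main() {
  std::vector<std::string> shapes_formated = {"(2,3,32,32)", "(6,3,5,5)"};
  // Mirrors GetFormatedInputShapes(): "[(2,3,32,32), (6,3,5,5)]"
  std::string input_shapes = fmt::format("[{}]", fmt::join(shapes_formated, ", "));
  // Mirrors Key(): "conv2d.[(2,3,32,32), (6,3,5,5)]"
  std::string key = fmt::format("{}.{}", "conv2d", input_shapes);
  fmt::print("{}\n", key);
  return 0;
}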
oneflow/core/profiler/event.h

@@ -138,11 +138,9 @@ class KernelEvent final : public IEvent {
   nlohmann::json ToJson() override;

   static std::shared_ptr<KernelEvent> Create(
-      const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter);
+      const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);

-  void RecordShape(const ShapeView& shape);
-
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
   void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
   void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
   bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {

@@ -160,17 +158,17 @@ class KernelEvent final : public IEvent {
  private:
  KernelEvent(const std::string& kernel_name,
-             const std::function<std::vector<ShapeView>(void)>& shape_getter)
+             const std::function<std::vector<Shape>(void)>& shape_getter)
      : IEvent(kernel_name, EventTimeUnit::kNS) {
    if (shape_getter) { input_shapes_ = shape_getter(); }
  }

-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
  int64_t memory_size_ = -1;
  std::set<std::shared_ptr<IEvent>> children_;
 #endif  // WITH_CUDA

- std::vector<ShapeView> input_shapes_;
+ std::vector<Shape> input_shapes_;
  std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
 };
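The switch from std::vector<ShapeView> to std::vector<Shape> matters because a profiler event outlives the kernel invocation whose shapes it records: a view refers to memory owned elsewhere, while an owning Shape copies the dims. A standalone sketch of the hazard, using simplified stand-ins for the OneFlow types (not the real Shape/ShapeView definitions):

#include <cstdint>
#include <vector>

struct Shape {               // owning: safe to keep inside a long-lived event
  std::vector<int64_t> dims;
};

struct ShapeView {           // non-owning: just points at someone else's dims
  const int64_t* dims;
  size_t num_axes;
};

std::vector<Shape> SafeGetter() {
  std::vector<int64_t> local_dims = {2, 3, 32, 32};  // dies at end of scope
  return {Shape{local_dims}};                        // copies: still valid later
}

std::vector<ShapeView> DanglingGetter() {
  std::vector<int64_t> local_dims = {2, 3, 32, 32};  // dies at end of scope
  return {ShapeView{local_dims.data(), local_dims.size()}};  // dangling pointer!
}

int main() {
  auto safe = SafeGetter();    // OK: the event can format these shapes much later
  auto bad = DanglingGetter();  // UB if ever dereferenced: storage already freed
  (void)safe;
  (void)bad;
  return 0;
}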
oneflow/core/profiler/event_recorder.cpp

@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st

 Maybe<EventRecorder> EventRecorder::CreateKernelEventRecorder(
     const std::string& name,
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
     const std::function<int64_t()>& memory_size_getter,
 #endif
     const ShapeGetterFuncType& shape_getter) {
   auto pmgr = Singleton<ProfileManager>::Get();
   if (pmgr) {
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
     if (pmgr->use_cpu_ || pmgr->use_cuda_) {
       auto event = KernelEvent::Create(name, pmgr->record_shapes_ ? shape_getter : nullptr);
       if (pmgr->use_cuda_) {
oneflow/core/profiler/event_recorder.h

@@ -24,7 +24,7 @@ namespace profiler {

 class EventRecorder {
  public:
-  using ShapeGetterFuncType = std::function<std::vector<ShapeView>(void)>;
+  using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;

   OF_DISALLOW_COPY_AND_MOVE(EventRecorder);

@@ -45,7 +45,7 @@ class EventRecorder {
   static Maybe<EventRecorder> CreateKernelEventRecorder(
       const std::string& name,
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
       const std::function<int64_t()>& memory_size_getter,
 #endif
       const ShapeGetterFuncType& shape_getter);
oneflow/core/profiler/kernel.cpp

@@ -17,7 +17,11 @@ limitations under the License.
 #include "oneflow/core/profiler/kernel.h"
 #include "oneflow/core/profiler/profiler.h"
 #include "oneflow/core/kernel/kernel.h"
+#ifdef WITH_ROCM
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#else
 #include "oneflow/core/ep/cuda/cuda_stream.h"
+#endif
 #include "oneflow/core/lazy/actor/actor_context.h"

 namespace oneflow {

@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
 thread_local cudaEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
 #endif  // WITH_CUDA

+#if defined(WITH_ROCM)
+thread_local hipEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
+thread_local hipEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
+#endif  // WITH_ROCM
+
 }  // namespace

 void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel* kernel) {

@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
   }
   if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
 #endif  // WITH_CUDA
+#if defined(WITH_ROCM)
+  if (profile_cuda_memory_bandwidth) {
+    auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
+    auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
+    if (cuda_stream != nullptr && actor_context_provider != nullptr) {
+      CHECK(cuda_memory_bandwidth_profile_start_event == nullptr);
+      CHECK(cuda_memory_bandwidth_profile_end_event == nullptr);
+      OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_start_event));
+      OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_end_event));
+      OF_CUDA_CHECK(
+          hipEventRecord(cuda_memory_bandwidth_profile_start_event, cuda_stream->cuda_stream()));
+    }
+  }
+  if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
+#endif  // WITH_ROCM
 }

 void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* kernel) {

@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
     }
   }
 #endif  // WITH_CUDA
+#if defined(WITH_ROCM)
+  if (profile_kernel_forward_range) { OF_PROFILER_RANGE_POP(); }
+  // The memory bandwidth profiler only works in lazy mode.
+  if (profile_cuda_memory_bandwidth) {
+    auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
+    auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
+    if (cuda_stream != nullptr && actor_context_provider != nullptr) {
+      hipEvent_t start_event = cuda_memory_bandwidth_profile_start_event;
+      hipEvent_t end_event = cuda_memory_bandwidth_profile_end_event;
+      cuda_memory_bandwidth_profile_start_event = nullptr;
+      cuda_memory_bandwidth_profile_end_event = nullptr;
+      CHECK_NOTNULL(start_event);
+      CHECK_NOTNULL(end_event);
+      OF_CUDA_CHECK(hipEventRecord(end_event, cuda_stream->cuda_stream()));
+      int64_t memory_size = 0;
+      for (const auto& bn : kernel->op_attribute().input_bns()) {
+        const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
+        if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
+      }
+      for (const auto& bn : kernel->op_attribute().output_bns()) {
+        const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
+        if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
+      }
+      const std::string op_name = kernel->op_conf().name();
+      actor_context_provider->GetActorContext()->AddCallback(
+          [start_event, end_event, memory_size, op_name]() {
+            float elapsed_ms = 0;
+            OF_CUDA_CHECK(hipEventElapsedTime(&elapsed_ms, start_event, end_event));
+            OF_CUDA_CHECK(hipEventDestroy(start_event));
+            OF_CUDA_CHECK(hipEventDestroy(end_event));
+            double bandwidth = static_cast<double>(memory_size) / (1024.0 * 1024.0 * 1024.0)
+                               / (elapsed_ms / 1000);
+            LOG(INFO) << "PROFILER::KERNEL::CUDA_MEMORY_BANDWIDTH op_name: " << op_name
+                      << " elapsed(ms): " << elapsed_ms << " memory_size(Byte): " << memory_size
+                      << " bandwidth(GB/s): " << bandwidth;
+          });
+    }
+  }
+#endif  // WITH_ROCM
 }

 }  // namespace profiler
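As a sanity check on the bandwidth formula in the callback above (bytes divided by 2^30 gives GiB, elapsed_ms divided by 1000 gives seconds), here is the same arithmetic as a standalone program with illustrative numbers, not values from a real run:

#include <cstdint>
#include <cstdio>

int main() {
  int64_t memory_size = 256LL * 1024 * 1024;  // assume 256 MiB across inputs + outputs
  float elapsed_ms = 20.0f;                   // assume a 20 ms hipEventElapsedTime result
  // 0.25 GiB / 0.02 s = 12.5 GB/s
  double bandwidth =
      static_cast<double>(memory_size) / (1024.0 * 1024.0 * 1024.0) / (elapsed_ms / 1000);
  std::printf("bandwidth(GB/s): %f\n", bandwidth);  // prints 12.500000
  return 0;
}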
oneflow/core/profiler/kineto_shim.cpp

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)

 #include "oneflow/core/profiler/kineto_shim.h"
 #include "libkineto.h"
oneflow/core/profiler/kineto_shim.h

@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
 #define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_

-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)

 #include <string>
 #include <memory>
oneflow/core/profiler/profile_manager.cpp

@@ -15,12 +15,12 @@ limitations under the License.
 */
 #include <memory>
 #include <unordered_map>
-// #include "fmt/core.h"
+#include "fmt/core.h"
 #include "nlohmann/json.hpp"
 #include "oneflow/core/profiler/kineto_shim.h"
 #include "oneflow/core/profiler/profile_manager.h"
 #include "oneflow/core/profiler/event.h"
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
 #include <libkineto.h>
 #endif  // WITH_CUDA

@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
 }

 std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
   auto trace = StopTrace();
   const auto& kineto_events = *(trace.get()->activities());
   std::set<std::shared_ptr<IEvent>> custom_events;

@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
   while (!events_.empty()) {
     auto evt = events_.front();
     events_.pop();
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
     auto evt_kernel = std::dynamic_pointer_cast<KernelEvent>(evt);
     if (evt_kernel) {
       std::set<int64_t> current_corr_ids;

@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
   } else {
     event_recorders_last_id_[name]++;
   }
-  // return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
-  return "yuguo";
+  return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
 }

 }  // namespace profiler
oneflow/core/profiler/profile_manager.h

@@ -37,7 +37,7 @@ class ProfileManager {
         use_cuda_(use_cuda),
         record_shapes_(record_shapes),
         record_bandwidth_(record_bandwidth) {
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
     std::set<ActivityType> activities{};
     if (use_cpu) { activities.insert(ActivityType::CPU); }
     if (use_cuda) { activities.insert(ActivityType::CUDA); }
oneflow/core/profiler/profiler.cpp

@@ -20,11 +20,20 @@ limitations under the License.
 #include "oneflow/core/profiler/event_recorder.h"
 #include "oneflow/core/vm/vm_util.h"
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_profile.h>
+#include <roctracer_roctx.h>
+#include <sys/syscall.h>
+#include <iostream>
+#include "oneflow/core/device/cuda_util.h"
+#else
 #include <nvtx3/nvToolsExt.h>
 #include <sys/syscall.h>
 #include <iostream>
 #include <cuda_profiler_api.h>
 #include "oneflow/core/device/cuda_util.h"
+#endif
 #endif  // OF_ENABLE_PROFILER

 namespace oneflow {

@@ -33,6 +42,16 @@ namespace profiler {

 void NameThisHostThread(const std::string& name) {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  static thread_local std::unique_ptr<std::string> thread_name_prefix;
+  if (!thread_name_prefix) {
+    thread_name_prefix.reset(
+        new std::string(GetStringFromEnv("ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX", "")));
+  }
+  const std::string name_with_prefix = *thread_name_prefix + name;
+  // nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
+  roctxMarkA(name_with_prefix.c_str());
+#else
   static thread_local std::unique_ptr<std::string> thread_name_prefix;
   if (!thread_name_prefix) {
     thread_name_prefix.reset(

@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
   }
   const std::string name_with_prefix = *thread_name_prefix + name;
   nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }

 void RangePush(const std::string& name) {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  roctxRangePushA(name.c_str());
+#else
   nvtxRangePushA(name.c_str());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }

 void RangePop() {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  roctxRangePop();
+#else
   nvtxRangePop();
+#endif
 #endif  // OF_ENABLE_PROFILER
 }

@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {

 void ProfilerStart() {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  OF_CUDA_CHECK(hipProfilerStart());
+#else
   OF_CUDA_CHECK(cudaProfilerStart());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }

 void ProfilerStop() {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  OF_CUDA_CHECK(hipProfilerStop());
+#else
   OF_CUDA_CHECK(cudaProfilerStop());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }

@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
 #if defined(WITH_CUDA)
   OF_CUDA_CHECK(cudaDeviceSynchronize());
 #endif  // WITH_CUDA
+#if defined(WITH_ROCM)
+  OF_CUDA_CHECK(hipDeviceSynchronize());
+#endif  // WITH_ROCM
   auto* pmgr = JUST(SingletonMaybe<ProfileManager>());
   std::string results = pmgr->DumpResultsJson();
   Singleton<ProfileManager>::Delete();
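RangePush/RangePop are normally used through a RAII guard (other files in this commit call OF_PROFILER_RANGE_GUARD), which guarantees the pop even on early return. A minimal sketch of such a guard, with the roctx/nvtx markers replaced by print stubs so it compiles and runs without either SDK; the RangeGuard name is a stand-in, not the OneFlow macro's real implementation:

#include <iostream>
#include <string>

// Stand-ins for roctxRangePushA/nvtxRangePushA and roctxRangePop/nvtxRangePop.
void RangePush(const std::string& name) { std::cout << "push " << name << "\n"; }
void RangePop() { std::cout << "pop\n"; }

// RAII guard: the destructor pops the range no matter how the scope exits.
class RangeGuard {
 public:
  explicit RangeGuard(const std::string& name) { RangePush(name); }
  ~RangeGuard() { RangePop(); }
  RangeGuard(const RangeGuard&) = delete;
  RangeGuard& operator=(const RangeGuard&) = delete;
};

int main() {
  RangeGuard guard("Compute");  // cf. OF_PROFILER_RANGE_GUARD("Compute")
  // ... kernel work happens here; the range closes automatically ...
  return 0;
}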
oneflow/user/kernels/math_unary_elementwise_func.h

@@ -250,7 +250,7 @@ struct LgammaFunctor<float> {
   static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
     // TODO(chengcheng): return: dy * digamma(x)
-    assert(false);
+    // assert(false);
     return 0.0f;
   }
 };

@@ -526,7 +526,7 @@ struct LgammaFunctor<double> {
   static OF_DEVICE_FUNC double Backward(const double x, const double dy) {
     // TODO(chengcheng): return: dy * digamma(x)
-    assert(false);
+    // assert(false);
     return 0.0;
   }
 };

@@ -817,7 +817,7 @@ struct LgammaFunctor<half> {
   static OF_HALF_FUNC half Backward(const half x, const half dy) {
     // TODO(chengcheng): return: dy * digamma(x)
-    assert(false);
+    // assert(false);
     return GetZeroVal<half>();
   }
 };
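For reference, the gradient these TODO comments name is the standard identity: the derivative of the log-gamma function is the digamma function, so the backward pass would be

$$\frac{\partial}{\partial x}\log\Gamma(x) = \psi(x) \quad\Longrightarrow\quad \frac{\partial L}{\partial x} = dy \cdot \psi(x).$$

The commit only disables the assert (which would abort the debug builds enabled elsewhere in this change) and keeps the zero placeholder return.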
oneflow/user/kernels/nvtx_range_kernel.hip.cpp  (new file, mode 100644)

/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#ifdef OF_ENABLE_PROFILER
#include <roctracer_roctx.h>
#endif  // OF_ENABLE_PROFILER

namespace oneflow {

namespace {

#ifdef OF_ENABLE_PROFILER
static thread_local HashMap<std::string, roctx_range_id_t> mark2range_id;
#endif

}  // namespace

class NvtxOpKernelState final : public user_op::OpKernelState {
 public:
  NvtxOpKernelState() : counter_(0) {
#ifndef OF_ENABLE_PROFILER
    LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON";
#endif
  }
  ~NvtxOpKernelState() override = default;

  int64_t counter() const { return counter_; }
  void IncreaseCount() { counter_ += 1; }

 private:
  int64_t counter_;
};

class NvtxStartKernel final : public user_op::OpKernel {
 public:
  NvtxStartKernel() = default;
  ~NvtxStartKernel() override = default;

  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const ShapeView& in_shape = in->shape_view();
    CHECK_EQ(out->shape_view(), in_shape);
    const DataType in_data_type = in->data_type();
    CHECK_EQ(out->data_type(), in_data_type);
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
                              in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
#ifdef OF_ENABLE_PROFILER
    auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
    const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
    roctx_range_id_t range_id = roctxRangeStartA(mark.c_str());
    CHECK(mark2range_id.emplace(mark, range_id).second);
    kernel_state->IncreaseCount();
#endif  // OF_ENABLE_PROFILER
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

REGISTER_USER_KERNEL("nvtx_start")
    .SetCreateFn<NvtxStartKernel>()
    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
    .SetInplaceProposalFn([](const user_op::InferContext&,
                             user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
      OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
      return Maybe<void>::Ok();
    });

class NvtxEndKernel final : public user_op::OpKernel {
 public:
  NvtxEndKernel() = default;
  ~NvtxEndKernel() override = default;

  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const ShapeView& in_shape = in->shape_view();
    CHECK_EQ(out->shape_view(), in_shape);
    const DataType in_data_type = in->data_type();
    CHECK_EQ(out->data_type(), in_data_type);
#ifdef OF_ENABLE_PROFILER
    auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
    const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
    auto it = mark2range_id.find(mark.c_str());
    CHECK(it != mark2range_id.end());
    roctx_range_id_t range_id = it->second;
    mark2range_id.erase(it);
    roctxRangeStop(range_id);
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
                              in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
    kernel_state->IncreaseCount();
#endif
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

REGISTER_USER_KERNEL("nvtx_end")
    .SetCreateFn<NvtxEndKernel>()
    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
    .SetInplaceProposalFn([](const user_op::InferContext&,
                             user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
      OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
      return Maybe<void>::Ok();
    });

}  // namespace oneflow
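The start/end kernels above pair ranges through a thread-local map keyed by "mark_prefix-counter". A standalone sketch of that bookkeeping, with roctxRangeStartA/roctxRangeStop replaced by stubs so it runs anywhere; all names here are stand-ins for illustration, not the OneFlow API:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

using range_id_t = uint64_t;  // stand-in for roctx_range_id_t

range_id_t RangeStart(const std::string& mark) {  // stand-in for roctxRangeStartA
  static range_id_t next_id = 0;
  std::cout << "start " << mark << "\n";
  return next_id++;
}
void RangeStop(range_id_t id) { std::cout << "stop " << id << "\n"; }  // roctxRangeStop

thread_local std::unordered_map<std::string, range_id_t> mark2range_id;

void NvtxStart(const std::string& prefix, int64_t counter) {
  const std::string mark = prefix + "-" + std::to_string(counter);
  mark2range_id.emplace(mark, RangeStart(mark));
}

void NvtxEnd(const std::string& prefix, int64_t counter) {
  const std::string mark = prefix + "-" + std::to_string(counter);
  auto it = mark2range_id.find(mark);
  if (it == mark2range_id.end()) return;  // unmatched end: nothing to close
  RangeStop(it->second);
  mark2range_id.erase(it);
}

int main() {
  NvtxStart("fw", 0);  // the start and end kernels keep independent counters,
  NvtxEnd("fw", 0);    // so the shared mark string is what pairs them up
  return 0;
}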
oneflow/user/kernels/stateful_opkernel.cpp

@@ -867,7 +867,7 @@ void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_c
   auto* compute_ctx = &compute_context;
   OF_PROFILER_RANGE_GUARD("Compute");
   if (Singleton<profiler::ProfileManager>::Get()) {
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
     const auto CalMemorySize = [compute_ctx](const one::ArgVec& args) -> int64_t {
       const auto Func = [compute_ctx](int64_t mem_size, const auto& pair) {
         const auto tensor = compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second);

@@ -878,13 +878,13 @@ void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_c
 #endif
     auto er_guard = CHECK_JUST(profiler::EventRecorder::CreateKernelEventRecorder(
         op_type_name(),
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
         [compute_ctx, CalMemorySize]() -> int64_t {
           return CalMemorySize(compute_ctx->inputs()) + CalMemorySize(compute_ctx->outputs());
         },
 #endif
-        [compute_ctx]() -> std::vector<ShapeView> {
-          std::vector<ShapeView> shapes;
+        [compute_ctx]() -> std::vector<Shape> {
+          std::vector<Shape> shapes;
           for (const auto& pair : compute_ctx->inputs()) {
             shapes.emplace_back(
                 compute_ctx->TensorDesc4ArgNameAndIndex(pair.first, pair.second)->shape());
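The visible lines of CalMemorySize define a binary fold step over (arg-name, index) pairs, presumably applied with std::accumulate in the truncated context. The same pattern reduced to standard C++, with a simplified stand-in Tensor and illustrative float32 sizes borrowed from the LeNet test shapes below:

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

struct Tensor {
  int64_t elem_cnt;
  int64_t bytes_per_elem;
};

// Sum the byte sizes of a kernel's arguments with a left fold.
int64_t CalMemorySize(const std::vector<Tensor>& args) {
  return std::accumulate(args.begin(), args.end(), int64_t{0},
                         [](int64_t mem_size, const Tensor& t) {
                           return mem_size + t.elem_cnt * t.bytes_per_elem;
                         });
}

int main() {
  std::vector<Tensor> inputs = {{2 * 3 * 32 * 32, 4}, {6 * 3 * 5 * 5, 4}};
  std::vector<Tensor> outputs = {{2 * 6 * 28 * 28, 4}};
  // Matches the recorder's usage: inputs + outputs in one number.
  std::cout << CalMemorySize(inputs) + CalMemorySize(outputs) << " bytes\n";
  return 0;
}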
python/oneflow/test/modules/fused_dot_feature_interaction.py  (new file, mode 100644)

import numpy as np
import oneflow as flow


def fused_dot_feature_interaction(
    x, y, self_interaction=False, output_padding=0, output_concat=None, dtype=flow.float32
):
    # (bs, es) = x.shape
    (bs, dims, es) = y.shape
    if self_interaction:
        offset = 1
    else:
        offset = 0
    li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)])
    lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)])
    T = flow.cat(
        [flow.reshape(x, (bs, 1, es)), y],
        dim=1,
    )
    Z = flow.matmul(T, T, transpose_b=True)
    # gather_nd does not support half, so cast to float32
    Z = flow.cast(Z, flow.float32)
    Zflat = Z[:, li, lj]
    Zflat = flow.cast(Zflat, dtype)
    if output_concat is not None:
        R = flow.cat([output_concat, Zflat], dim=1)
    else:
        R = Zflat
    if output_padding != 0:
        padding_tensor = flow.tensor(
            np.zeros((bs, output_padding)).astype(np.float32),
            device="cuda",
            requires_grad=False,
        )
        R = flow.cat([R, padding_tensor], dim=1)
    return R
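In formula terms (my reading of the code above): x is reshaped to a leading feature row and concatenated with y, the batched Gram matrix of the result is taken, and the lower-triangular entries are flattened out:

$$T = \mathrm{cat}\!\big(x^{(bs,1,es)},\, y\big) \in \mathbb{R}^{bs \times (dims+1) \times es}, \qquad Z = T T^{\top} \in \mathbb{R}^{bs \times (dims+1) \times (dims+1)},$$

$$Z^{\mathrm{flat}}_{b} = \big(Z_{b,i,j}\big)_{\,0 \le i \le dims,\; 0 \le j < i + \mathrm{offset}},$$

where offset = 1 with self_interaction (keeping the diagonal, i.e. each feature's dot product with itself) and 0 without.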
python/oneflow/test/profiler/test_profile_lenet.py

@@ -80,7 +80,7 @@ def _test_lenet(
         with oneflow.profiler.record_function("lenet_backward_total_time") as f:
             eager_res.sum().backward()
     events = prof.key_averages(group_by_input_shape=True)
     print(events)
     conv_event = get_event(events, "conv2d", "[(2,3,32,32), (6,3,5,5)]" if record_shapes else "-")