Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Oneflow
Commits
f262efc9
Commit
f262efc9
authored
Nov 21, 2022
by
yuguo
Browse files
Support profiler for DCU, support debug compiler
parent
3f56062c
Changes
17
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
2666 additions
and
2387 deletions
+2666
-2387
CMakeLists.txt
CMakeLists.txt
+2
-2
cmake/third_party.cmake
cmake/third_party.cmake
+2
-0
oneflow/core/profiler/event.cpp
oneflow/core/profiler/event.cpp
+90
-94
oneflow/core/profiler/event.h
oneflow/core/profiler/event.h
+186
-188
oneflow/core/profiler/event_recorder.cpp
oneflow/core/profiler/event_recorder.cpp
+2
-2
oneflow/core/profiler/event_recorder.h
oneflow/core/profiler/event_recorder.h
+60
-60
oneflow/core/profiler/kernel.cpp
oneflow/core/profiler/kernel.cpp
+64
-0
oneflow/core/profiler/kineto_shim.cpp
oneflow/core/profiler/kineto_shim.cpp
+1
-1
oneflow/core/profiler/kineto_shim.h
oneflow/core/profiler/kineto_shim.h
+1
-1
oneflow/core/profiler/profile_manager.cpp
oneflow/core/profiler/profile_manager.cpp
+5
-6
oneflow/core/profiler/profile_manager.h
oneflow/core/profiler/profile_manager.h
+1
-1
oneflow/core/profiler/profiler.cpp
oneflow/core/profiler/profiler.cpp
+39
-0
oneflow/user/kernels/math_unary_elementwise_func.h
oneflow/user/kernels/math_unary_elementwise_func.h
+983
-983
oneflow/user/kernels/nvtx_range_kernel.hip.cpp
oneflow/user/kernels/nvtx_range_kernel.hip.cpp
+138
-0
oneflow/user/kernels/stateful_opkernel.cpp
oneflow/user/kernels/stateful_opkernel.cpp
+901
-901
python/oneflow/test/modules/fused_dot_feature_interaction.py
python/oneflow/test/modules/fused_dot_feature_interaction.py
+43
-0
python/oneflow/test/profiler/test_profile_lenet.py
python/oneflow/test/profiler/test_profile_lenet.py
+148
-148
No files found.
CMakeLists.txt
View file @
f262efc9
...
@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
...
@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
use_mirror
(
VARIABLE ROBIN_HOOD_HASHING_URL URL
${
ROBIN_HOOD_HASHING_URL
}
)
use_mirror
(
VARIABLE ROBIN_HOOD_HASHING_URL URL
${
ROBIN_HOOD_HASHING_URL
}
)
set
(
ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467
)
set
(
ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467
)
set
(
FMT_URL https://github.com/fmtlib/fmt/archive/
48b7e3dafb27ece02cd6addc8bd1041c79d59c2c
.zip
)
set
(
FMT_URL https://github.com/fmtlib/fmt/archive/
fc07217d85e6dcec52878807d6bbd89a9d9156a5
.zip
)
use_mirror
(
VARIABLE FMT_URL URL
${
FMT_URL
}
)
use_mirror
(
VARIABLE FMT_URL URL
${
FMT_URL
}
)
set
(
FMT_MD5
45925a979ed7195e0c88a70be691de09
)
set
(
FMT_MD5
7d9bb2ececc9ede29cd35bdc42a7e22c
)
set
(
KINETO_URL
set
(
KINETO_URL
https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip
)
https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip
)
...
...
cmake/third_party.cmake
View file @
f262efc9
...
@@ -175,6 +175,8 @@ if (BUILD_ROCM)
...
@@ -175,6 +175,8 @@ if (BUILD_ROCM)
add_definitions
(
-D__HIP_PLATFORM_HCC__
)
add_definitions
(
-D__HIP_PLATFORM_HCC__
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024"
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024"
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
-D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024"
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
-D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024"
)
set
(
CMAKE_CXX_FLAGS_DEBUG
"
${
CMAKE_CXX_FLAGS_DEBUG
}
-mcmodel=large"
)
set
(
CMAKE_C_FLAGS_DEBUG
"
${
CMAKE_C_FLAGS_DEBUG
}
-mcmodel=large"
)
list
(
APPEND oneflow_third_party_libs hip::device
)
list
(
APPEND oneflow_third_party_libs hip::device
)
list
(
APPEND oneflow_third_party_libs roc::hipblas
)
list
(
APPEND oneflow_third_party_libs roc::hipblas
)
list
(
APPEND oneflow_third_party_libs hip::hipcub
)
list
(
APPEND oneflow_third_party_libs hip::hipcub
)
...
...
oneflow/core/profiler/event.cpp
View file @
f262efc9
/*
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.
*/
*/
// #include "fmt/core.h"
#include "fmt/core.h"
// #include "fmt/format.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
#include "oneflow/core/profiler/util.h"
using
json
=
nlohmann
::
json
;
using
json
=
nlohmann
::
json
;
namespace
oneflow
{
namespace
oneflow
{
namespace
profiler
{
namespace
profiler
{
nlohmann
::
json
IEvent
::
ToJson
()
{
nlohmann
::
json
IEvent
::
ToJson
()
{
return
json
{{
"name"
,
name_
},
{
"time"
,
GetDuration
<
double
>
()},
{
"input_shapes"
,
"-"
}};
return
json
{{
"name"
,
name_
},
{
"time"
,
GetDuration
<
double
>
()},
{
"input_shapes"
,
"-"
}};
}
}
void
IEvent
::
SetStartedAt
(
double
t
)
{
started_at_
=
t
;
}
void
IEvent
::
SetStartedAt
(
double
t
)
{
started_at_
=
t
;
}
void
IEvent
::
SetFinishedAt
(
double
t
)
{
finished_at_
=
t
;
}
void
IEvent
::
SetFinishedAt
(
double
t
)
{
finished_at_
=
t
;
}
void
IEvent
::
Start
()
{
SetStartedAt
(
GetTimeNow
());
}
void
IEvent
::
Start
()
{
SetStartedAt
(
GetTimeNow
());
}
void
IEvent
::
Finish
()
{
SetFinishedAt
(
GetTimeNow
());
}
void
IEvent
::
Finish
()
{
SetFinishedAt
(
GetTimeNow
());
}
bool
IEvent
::
IsChildOf
(
const
IEvent
*
e
)
{
bool
IEvent
::
IsChildOf
(
const
IEvent
*
e
)
{
if
(
!
e
)
{
return
false
;
}
if
(
!
e
)
{
return
false
;
}
if
(
this
==
e
)
{
return
false
;
}
if
(
this
==
e
)
{
return
false
;
}
return
GetStartedAt
<
double
>
()
>=
e
->
GetStartedAt
<
double
>
()
return
GetStartedAt
<
double
>
()
>=
e
->
GetStartedAt
<
double
>
()
&&
GetFinishedAt
<
double
>
()
<=
e
->
GetFinishedAt
<
double
>
();
&&
GetFinishedAt
<
double
>
()
<=
e
->
GetFinishedAt
<
double
>
();
}
}
const
std
::
string
&
IEvent
::
GetName
()
const
{
return
name_
;
}
const
std
::
string
&
IEvent
::
GetName
()
const
{
return
name_
;
}
std
::
string
CustomEvent
::
Key
()
{
return
name_
;
}
std
::
string
CustomEvent
::
Key
()
{
return
name_
;
}
nlohmann
::
json
CustomEvent
::
ToJson
()
{
nlohmann
::
json
CustomEvent
::
ToJson
()
{
auto
j
=
IEvent
::
ToJson
();
auto
j
=
IEvent
::
ToJson
();
j
[
"type"
]
=
EventType
::
kCustom
;
j
[
"type"
]
=
EventType
::
kCustom
;
j
[
"custom_type"
]
=
type_
;
j
[
"custom_type"
]
=
type_
;
return
j
;
return
j
;
}
}
std
::
shared_ptr
<
CustomEvent
>
CustomEvent
::
Create
(
const
std
::
string
&
name
,
CustomEventType
type
)
{
std
::
shared_ptr
<
CustomEvent
>
CustomEvent
::
Create
(
const
std
::
string
&
name
,
CustomEventType
type
)
{
return
std
::
shared_ptr
<
CustomEvent
>
(
new
CustomEvent
(
name
,
type
));
return
std
::
shared_ptr
<
CustomEvent
>
(
new
CustomEvent
(
name
,
type
));
}
}
// std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
std
::
string
KernelEvent
::
Key
()
{
return
fmt
::
format
(
"{}.{}"
,
name_
,
GetFormatedInputShapes
());
}
std
::
string
KernelEvent
::
Key
()
{
return
"yuguo"
;
}
nlohmann
::
json
KernelEvent
::
ToJson
()
{
nlohmann
::
json
KernelEvent
::
ToJson
()
{
auto
j
=
IEvent
::
ToJson
();
auto
j
=
IEvent
::
ToJson
();
j
[
"type"
]
=
EventType
::
kOneflowKernel
;
j
[
"type"
]
=
EventType
::
kOneflowKernel
;
j
[
"input_shapes"
]
=
GetFormatedInputShapes
();
j
[
"input_shapes"
]
=
GetFormatedInputShapes
();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#if defined(WITH_CUDA)
j
[
"memory_size"
]
=
memory_size_
;
j
[
"memory_size"
]
=
memory_size_
;
if
(
!
children_
.
empty
())
{
j
[
"children"
]
=
children_
;
}
if
(
!
children_
.
empty
())
{
j
[
"children"
]
=
children_
;
}
#endif // WITH_CUDA
#endif // WITH_CUDA
return
j
;
return
j
;
}
}
std
::
shared_ptr
<
KernelEvent
>
KernelEvent
::
Create
(
std
::
shared_ptr
<
KernelEvent
>
KernelEvent
::
Create
(
const
std
::
string
&
name
,
const
std
::
function
<
std
::
vector
<
Shape
>
(
void
)
>&
shape_getter
)
{
const
std
::
string
&
name
,
const
std
::
function
<
std
::
vector
<
ShapeView
>
(
void
)
>&
shape_getter
)
{
return
std
::
shared_ptr
<
KernelEvent
>
(
new
KernelEvent
(
name
,
shape_getter
));
return
std
::
shared_ptr
<
KernelEvent
>
(
new
KernelEvent
(
name
,
shape_getter
));
}
}
std
::
string
KernelEvent
::
GetFormatedInputShapes
(
size_t
max_num_to_format
)
{
void
KernelEvent
::
RecordShape
(
const
ShapeView
&
shape
)
{
input_shapes_
.
emplace_back
(
shape
);
}
if
(
input_shapes_
.
size
()
==
0
)
{
return
"-"
;
}
std
::
vector
<
std
::
string
>
shapes_formated
(
std
::
min
(
input_shapes_
.
size
(),
max_num_to_format
));
std
::
string
KernelEvent
::
GetFormatedInputShapes
(
size_t
max_num_to_format
)
{
for
(
auto
i
=
0
;
i
<
shapes_formated
.
size
();
++
i
)
{
if
(
input_shapes_
.
size
()
==
0
)
{
return
"-"
;
}
const
std
::
string
current_shape
=
input_shapes_
[
i
].
ToString
();
std
::
vector
<
std
::
string
>
shapes_formated
(
std
::
min
(
input_shapes_
.
size
(),
max_num_to_format
));
shapes_formated
[
i
]
=
current_shape
==
"()"
?
"scalar"
:
current_shape
;
for
(
auto
i
=
0
;
i
<
shapes_formated
.
size
();
++
i
)
{
}
const
std
::
string
current_shape
=
input_shapes_
[
i
].
ToString
();
if
(
input_shapes_
.
size
()
>
max_num_to_format
)
{
shapes_formated
.
emplace_back
(
"..."
);
}
shapes_formated
[
i
]
=
current_shape
==
"()"
?
"scalar"
:
current_shape
;
return
fmt
::
format
(
"[{}]"
,
fmt
::
join
(
shapes_formated
,
", "
));
}
}
if
(
input_shapes_
.
size
()
>
max_num_to_format
)
{
shapes_formated
.
emplace_back
(
"..."
);
}
// return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
// namespace profiler
return
"yuguo"
;
}
}
// namespace profiler
}
// namespace oneflow
}
// namespace oneflow
\ No newline at end of file
oneflow/core/profiler/event.h
View file @
f262efc9
/*
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.
*/
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <functional>
#include <memory>
#include <memory>
#include <vector>
#include <vector>
#include "nlohmann/json.hpp"
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
#include "oneflow/core/common/shape_view.h"
namespace
oneflow
{
namespace
oneflow
{
namespace
profiler
{
namespace
profiler
{
class
ProfileManager
;
class
ProfileManager
;
enum
class
EventType
{
enum
class
EventType
{
kCustom
,
// has three kinds
kCustom
,
// has three kinds
kOneflowKernel
// OneFlow cpu/cuda kernel
kOneflowKernel
// OneFlow cpu/cuda kernel
};
};
enum
class
CustomEventType
{
enum
class
CustomEventType
{
kDefault
,
// for record_function
kDefault
,
// for record_function
kCudaKernel
,
// cuda kernel
kCudaKernel
,
// cuda kernel
kCudaRuntime
// something like cudaLaunchKernel
kCudaRuntime
// something like cudaLaunchKernel
};
};
enum
class
EventTimeUnit
{
kNS
,
kUS
};
enum
class
EventTimeUnit
{
kNS
,
kUS
};
class
IEvent
{
class
IEvent
{
public:
public:
OF_DISALLOW_COPY_AND_MOVE
(
IEvent
);
OF_DISALLOW_COPY_AND_MOVE
(
IEvent
);
IEvent
()
=
delete
;
IEvent
()
=
delete
;
IEvent
(
const
std
::
string
&
name
,
EventTimeUnit
time_unit
)
:
name_
(
name
),
time_unit_
(
time_unit
)
{}
IEvent
(
const
std
::
string
&
name
,
EventTimeUnit
time_unit
)
:
name_
(
name
),
time_unit_
(
time_unit
)
{}
virtual
std
::
string
Key
()
=
0
;
virtual
std
::
string
Key
()
=
0
;
virtual
nlohmann
::
json
ToJson
();
virtual
nlohmann
::
json
ToJson
();
virtual
~
IEvent
()
=
default
;
virtual
~
IEvent
()
=
default
;
virtual
void
Start
();
virtual
void
Start
();
virtual
void
Finish
();
virtual
void
Finish
();
bool
IsChildOf
(
const
IEvent
*
e
);
bool
IsChildOf
(
const
IEvent
*
e
);
const
std
::
string
&
GetName
()
const
;
const
std
::
string
&
GetName
()
const
;
template
<
typename
T
>
template
<
typename
T
>
const
T
GetDuration
(
EventTimeUnit
time_unit
=
EventTimeUnit
::
kUS
)
const
;
const
T
GetDuration
(
EventTimeUnit
time_unit
=
EventTimeUnit
::
kUS
)
const
;
template
<
typename
T
>
template
<
typename
T
>
const
T
GetStartedAt
(
EventTimeUnit
time_unit
=
EventTimeUnit
::
kUS
)
const
;
const
T
GetStartedAt
(
EventTimeUnit
time_unit
=
EventTimeUnit
::
kUS
)
const
;
template
<
typename
T
>
template
<
typename
T
>
const
T
GetFinishedAt
(
EventTimeUnit
time_unit
=
EventTimeUnit
::
kUS
)
const
;
const
T
GetFinishedAt
(
EventTimeUnit
time_unit
=
EventTimeUnit
::
kUS
)
const
;
protected:
protected:
virtual
void
SetStartedAt
(
double
t
);
virtual
void
SetStartedAt
(
double
t
);
virtual
void
SetFinishedAt
(
double
t
);
virtual
void
SetFinishedAt
(
double
t
);
std
::
string
name_
;
std
::
string
name_
;
EventTimeUnit
time_unit_
;
EventTimeUnit
time_unit_
;
double
started_at_
=
0
;
double
started_at_
=
0
;
double
finished_at_
=
0
;
double
finished_at_
=
0
;
};
};
inline
double
ConvertTime
(
double
time_
,
EventTimeUnit
src_time_unit
,
EventTimeUnit
dst_time_unit
)
{
inline
double
ConvertTime
(
double
time_
,
EventTimeUnit
src_time_unit
,
EventTimeUnit
dst_time_unit
)
{
if
(
src_time_unit
==
EventTimeUnit
::
kNS
&&
dst_time_unit
==
EventTimeUnit
::
kUS
)
{
if
(
src_time_unit
==
EventTimeUnit
::
kNS
&&
dst_time_unit
==
EventTimeUnit
::
kUS
)
{
return
time_
/
1000
;
return
time_
/
1000
;
}
}
if
(
src_time_unit
==
EventTimeUnit
::
kUS
&&
dst_time_unit
==
EventTimeUnit
::
kNS
)
{
if
(
src_time_unit
==
EventTimeUnit
::
kUS
&&
dst_time_unit
==
EventTimeUnit
::
kNS
)
{
return
time_
*
1000
;
return
time_
*
1000
;
}
}
return
time_
;
return
time_
;
}
}
template
<
>
template
<
>
const
inline
double
IEvent
::
GetStartedAt
<
double
>
(
EventTimeUnit
time_unit
)
const
{
const
inline
double
IEvent
::
GetStartedAt
<
double
>
(
EventTimeUnit
time_unit
)
const
{
return
ConvertTime
(
started_at_
,
time_unit_
,
time_unit
);
return
ConvertTime
(
started_at_
,
time_unit_
,
time_unit
);
}
}
template
<
>
template
<
>
const
inline
time_t
IEvent
::
GetStartedAt
<
time_t
>
(
EventTimeUnit
time_unit
)
const
{
const
inline
time_t
IEvent
::
GetStartedAt
<
time_t
>
(
EventTimeUnit
time_unit
)
const
{
return
static_cast
<
time_t
>
(
GetStartedAt
<
double
>
(
time_unit
));
return
static_cast
<
time_t
>
(
GetStartedAt
<
double
>
(
time_unit
));
}
}
template
<
>
template
<
>
const
inline
double
IEvent
::
GetFinishedAt
<
double
>
(
EventTimeUnit
time_unit
)
const
{
const
inline
double
IEvent
::
GetFinishedAt
<
double
>
(
EventTimeUnit
time_unit
)
const
{
return
ConvertTime
(
finished_at_
,
time_unit_
,
time_unit
);
return
ConvertTime
(
finished_at_
,
time_unit_
,
time_unit
);
}
}
template
<
>
template
<
>
const
inline
time_t
IEvent
::
GetFinishedAt
<
time_t
>
(
EventTimeUnit
time_unit
)
const
{
const
inline
time_t
IEvent
::
GetFinishedAt
<
time_t
>
(
EventTimeUnit
time_unit
)
const
{
return
static_cast
<
time_t
>
(
GetFinishedAt
<
double
>
(
time_unit
));
return
static_cast
<
time_t
>
(
GetFinishedAt
<
double
>
(
time_unit
));
}
}
template
<
>
template
<
>
const
inline
double
IEvent
::
GetDuration
<
double
>
(
EventTimeUnit
time_unit
)
const
{
const
inline
double
IEvent
::
GetDuration
<
double
>
(
EventTimeUnit
time_unit
)
const
{
return
GetFinishedAt
<
double
>
(
time_unit
)
-
GetStartedAt
<
double
>
(
time_unit
);
return
GetFinishedAt
<
double
>
(
time_unit
)
-
GetStartedAt
<
double
>
(
time_unit
);
}
}
template
<
>
template
<
>
const
inline
time_t
IEvent
::
GetDuration
<
time_t
>
(
EventTimeUnit
time_unit
)
const
{
const
inline
time_t
IEvent
::
GetDuration
<
time_t
>
(
EventTimeUnit
time_unit
)
const
{
return
static_cast
<
time_t
>
(
GetDuration
<
double
>
(
time_unit
));
return
static_cast
<
time_t
>
(
GetDuration
<
double
>
(
time_unit
));
}
}
class
CustomEvent
final
:
public
IEvent
{
class
CustomEvent
final
:
public
IEvent
{
public:
public:
friend
class
ProfileManager
;
friend
class
ProfileManager
;
std
::
string
Key
()
override
;
std
::
string
Key
()
override
;
nlohmann
::
json
ToJson
()
override
;
nlohmann
::
json
ToJson
()
override
;
static
std
::
shared_ptr
<
CustomEvent
>
Create
(
const
std
::
string
&
name
,
static
std
::
shared_ptr
<
CustomEvent
>
Create
(
const
std
::
string
&
name
,
CustomEventType
type
=
CustomEventType
::
kDefault
);
CustomEventType
type
=
CustomEventType
::
kDefault
);
private:
private:
CustomEventType
type_
;
CustomEventType
type_
;
CustomEvent
(
const
std
::
string
&
custom_name
,
CustomEventType
type
)
CustomEvent
(
const
std
::
string
&
custom_name
,
CustomEventType
type
)
:
IEvent
(
custom_name
,
:
IEvent
(
custom_name
,
type
==
CustomEventType
::
kDefault
?
EventTimeUnit
::
kNS
:
EventTimeUnit
::
kUS
),
type
==
CustomEventType
::
kDefault
?
EventTimeUnit
::
kNS
:
EventTimeUnit
::
kUS
),
type_
(
type
)
{}
type_
(
type
)
{}
};
};
class
KernelEvent
final
:
public
IEvent
{
class
KernelEvent
final
:
public
IEvent
{
public:
public:
std
::
string
Key
()
override
;
std
::
string
Key
()
override
;
nlohmann
::
json
ToJson
()
override
;
nlohmann
::
json
ToJson
()
override
;
static
std
::
shared_ptr
<
KernelEvent
>
Create
(
static
std
::
shared_ptr
<
KernelEvent
>
Create
(
const
std
::
string
&
name
,
const
std
::
function
<
std
::
vector
<
ShapeView
>
(
void
)
>&
shape_getter
);
const
std
::
string
&
name
,
const
std
::
function
<
std
::
vector
<
Shape
>
(
void
)
>&
shape_getter
);
void
RecordShape
(
const
ShapeView
&
shape
);
#if defined(WITH_CUDA) || defined(WITH_ROCM)
void
SetMemorySize
(
int64_t
memory_size
)
{
memory_size_
=
memory_size
;
}
#if defined(WITH_CUDA)
void
AddChildEvent
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
{
children_
.
emplace
(
e
);
}
void
SetMemorySize
(
int64_t
memory_size
)
{
memory_size_
=
memory_size
;
}
bool
AddChildEventIfSo
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
{
void
AddChildEvent
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
{
children_
.
emplace
(
e
);
}
if
(
e
->
IsChildOf
(
dynamic_cast
<
IEvent
*>
(
this
)))
{
bool
AddChildEventIfSo
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
{
children_
.
emplace
(
e
);
if
(
e
->
IsChildOf
(
dynamic_cast
<
IEvent
*>
(
this
)))
{
return
true
;
children_
.
emplace
(
e
);
}
return
true
;
return
false
;
}
}
return
false
;
bool
HasChildEvent
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
{
return
children_
.
count
(
e
);
}
}
void
WalkAmongChildren
(
const
std
::
function
<
void
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
>&
f
)
const
{
bool
HasChildEvent
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
{
return
children_
.
count
(
e
);
}
for
(
const
auto
&
x
:
children_
)
{
f
(
x
);
}
void
WalkAmongChildren
(
const
std
::
function
<
void
(
const
std
::
shared_ptr
<
IEvent
>&
e
)
>&
f
)
const
{
}
for
(
const
auto
&
x
:
children_
)
{
f
(
x
);
}
#endif // WITH_CUDA
}
#endif // WITH_CUDA
private:
KernelEvent
(
const
std
::
string
&
kernel_name
,
private:
const
std
::
function
<
std
::
vector
<
Shape
>
(
void
)
>&
shape_getter
)
KernelEvent
(
const
std
::
string
&
kernel_name
,
:
IEvent
(
kernel_name
,
EventTimeUnit
::
kNS
)
{
const
std
::
function
<
std
::
vector
<
ShapeView
>
(
void
)
>&
shape_getter
)
if
(
shape_getter
)
{
input_shapes_
=
shape_getter
();
}
:
IEvent
(
kernel_name
,
EventTimeUnit
::
kNS
)
{
}
if
(
shape_getter
)
{
input_shapes_
=
shape_getter
();
}
}
#if defined(WITH_CUDA) || defined(WITH_ROCM)
int64_t
memory_size_
=
-
1
;
#if defined(WITH_CUDA)
std
::
set
<
std
::
shared_ptr
<
IEvent
>>
children_
;
int64_t
memory_size_
=
-
1
;
#endif // WITH_CUDA
std
::
set
<
std
::
shared_ptr
<
IEvent
>>
children_
;
#endif // WITH_CUDA
std
::
vector
<
Shape
>
input_shapes_
;
std
::
string
GetFormatedInputShapes
(
size_t
max_num_to_format
=
4
);
std
::
vector
<
ShapeView
>
input_shapes_
;
};
std
::
string
GetFormatedInputShapes
(
size_t
max_num_to_format
=
4
);
};
}
// namespace profiler
}
// namespace oneflow
}
// namespace profiler
}
// namespace oneflow
namespace
nlohmann
{
namespace
nlohmann
{
inline
void
to_json
(
json
&
j
,
const
std
::
shared_ptr
<::
oneflow
::
profiler
::
IEvent
>&
event
)
{
j
=
event
->
ToJson
();
inline
void
to_json
(
json
&
j
,
const
std
::
shared_ptr
<::
oneflow
::
profiler
::
IEvent
>&
event
)
{
}
j
=
event
->
ToJson
();
}
}
// namespace nlohmann
}
// namespace nlohmann
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
oneflow/core/profiler/event_recorder.cpp
View file @
f262efc9
...
@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st
...
@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st
Maybe
<
EventRecorder
>
EventRecorder
::
CreateKernelEventRecorder
(
Maybe
<
EventRecorder
>
EventRecorder
::
CreateKernelEventRecorder
(
const
std
::
string
&
name
,
const
std
::
string
&
name
,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
const
std
::
function
<
int64_t
()
>&
memory_size_getter
,
const
std
::
function
<
int64_t
()
>&
memory_size_getter
,
#endif
#endif
const
ShapeGetterFuncType
&
shape_getter
)
{
const
ShapeGetterFuncType
&
shape_getter
)
{
auto
pmgr
=
Singleton
<
ProfileManager
>::
Get
();
auto
pmgr
=
Singleton
<
ProfileManager
>::
Get
();
if
(
pmgr
)
{
if
(
pmgr
)
{
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
if
(
pmgr
->
use_cpu_
||
pmgr
->
use_cuda_
)
{
if
(
pmgr
->
use_cpu_
||
pmgr
->
use_cuda_
)
{
auto
event
=
KernelEvent
::
Create
(
name
,
pmgr
->
record_shapes_
?
shape_getter
:
nullptr
);
auto
event
=
KernelEvent
::
Create
(
name
,
pmgr
->
record_shapes_
?
shape_getter
:
nullptr
);
if
(
pmgr
->
use_cuda_
)
{
if
(
pmgr
->
use_cuda_
)
{
...
...
oneflow/core/profiler/event_recorder.h
View file @
f262efc9
/*
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.
*/
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/event.h"
namespace
oneflow
{
namespace
oneflow
{
namespace
profiler
{
namespace
profiler
{
class
EventRecorder
{
class
EventRecorder
{
public:
public:
using
ShapeGetterFuncType
=
std
::
function
<
std
::
vector
<
Shape
View
>
(
void
)
>
;
using
ShapeGetterFuncType
=
std
::
function
<
std
::
vector
<
Shape
>
(
void
)
>
;
OF_DISALLOW_COPY_AND_MOVE
(
EventRecorder
);
OF_DISALLOW_COPY_AND_MOVE
(
EventRecorder
);
explicit
EventRecorder
(
const
std
::
shared_ptr
<
IEvent
>&
event
)
:
event_
(
event
)
{
explicit
EventRecorder
(
const
std
::
shared_ptr
<
IEvent
>&
event
)
:
event_
(
event
)
{
CHECK_JUST
(
RegisterEventToProfileManager
(
event
));
CHECK_JUST
(
RegisterEventToProfileManager
(
event
));
event_
->
Start
();
event_
->
Start
();
}
}
Maybe
<
void
>
RegisterEventToProfileManager
(
const
std
::
shared_ptr
<
IEvent
>&
event
);
Maybe
<
void
>
RegisterEventToProfileManager
(
const
std
::
shared_ptr
<
IEvent
>&
event
);
~
EventRecorder
()
{
~
EventRecorder
()
{
if
(
event_
)
{
if
(
event_
)
{
event_
->
Finish
();
event_
->
Finish
();
event_
.
reset
();
event_
.
reset
();
}
}
}
}
static
std
::
shared_ptr
<
EventRecorder
>
CreateCustomEventRecorder
(
const
std
::
string
&
name
);
static
std
::
shared_ptr
<
EventRecorder
>
CreateCustomEventRecorder
(
const
std
::
string
&
name
);
static
Maybe
<
EventRecorder
>
CreateKernelEventRecorder
(
static
Maybe
<
EventRecorder
>
CreateKernelEventRecorder
(
const
std
::
string
&
name
,
const
std
::
string
&
name
,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
const
std
::
function
<
int64_t
()
>&
memory_size_getter
,
const
std
::
function
<
int64_t
()
>&
memory_size_getter
,
#endif
#endif
const
ShapeGetterFuncType
&
shape_getter
);
const
ShapeGetterFuncType
&
shape_getter
);
private:
private:
std
::
shared_ptr
<
IEvent
>
event_
;
std
::
shared_ptr
<
IEvent
>
event_
;
};
};
}
// namespace profiler
}
// namespace profiler
}
// namespace oneflow
}
// namespace oneflow
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
oneflow/core/profiler/kernel.cpp
View file @
f262efc9
...
@@ -17,7 +17,11 @@ limitations under the License.
...
@@ -17,7 +17,11 @@ limitations under the License.
#include "oneflow/core/profiler/kernel.h"
#include "oneflow/core/profiler/kernel.h"
#include "oneflow/core/profiler/profiler.h"
#include "oneflow/core/profiler/profiler.h"
#include "oneflow/core/kernel/kernel.h"
#include "oneflow/core/kernel/kernel.h"
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#else
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#endif
#include "oneflow/core/lazy/actor/actor_context.h"
#include "oneflow/core/lazy/actor/actor_context.h"
namespace
oneflow
{
namespace
oneflow
{
...
@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
...
@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local
cudaEvent_t
cuda_memory_bandwidth_profile_end_event
=
nullptr
;
thread_local
cudaEvent_t
cuda_memory_bandwidth_profile_end_event
=
nullptr
;
#endif // WITH_CUDA
#endif // WITH_CUDA
#if defined(WITH_ROCM)
thread_local
hipEvent_t
cuda_memory_bandwidth_profile_start_event
=
nullptr
;
thread_local
hipEvent_t
cuda_memory_bandwidth_profile_end_event
=
nullptr
;
#endif // WITH_ROCM
}
// namespace
}
// namespace
void
TraceKernelForwardDataContentStart
(
KernelContext
*
kernel_ctx
,
const
Kernel
*
kernel
)
{
void
TraceKernelForwardDataContentStart
(
KernelContext
*
kernel_ctx
,
const
Kernel
*
kernel
)
{
...
@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
...
@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
}
}
if
(
profile_kernel_forward_range
)
{
OF_PROFILER_RANGE_PUSH
(
kernel
->
op_conf
().
name
());
}
if
(
profile_kernel_forward_range
)
{
OF_PROFILER_RANGE_PUSH
(
kernel
->
op_conf
().
name
());
}
#endif // WITH_CUDA
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if
(
profile_cuda_memory_bandwidth
)
{
auto
*
actor_context_provider
=
dynamic_cast
<
ActorContextProvider
*>
(
kernel_ctx
);
auto
*
cuda_stream
=
dynamic_cast
<
ep
::
CudaStream
*>
(
kernel_ctx
->
stream
());
if
(
cuda_stream
!=
nullptr
&&
actor_context_provider
!=
nullptr
)
{
CHECK
(
cuda_memory_bandwidth_profile_start_event
==
nullptr
);
CHECK
(
cuda_memory_bandwidth_profile_end_event
==
nullptr
);
OF_CUDA_CHECK
(
hipEventCreate
(
&
cuda_memory_bandwidth_profile_start_event
));
OF_CUDA_CHECK
(
hipEventCreate
(
&
cuda_memory_bandwidth_profile_end_event
));
OF_CUDA_CHECK
(
hipEventRecord
(
cuda_memory_bandwidth_profile_start_event
,
cuda_stream
->
cuda_stream
()));
}
}
if
(
profile_kernel_forward_range
)
{
OF_PROFILER_RANGE_PUSH
(
kernel
->
op_conf
().
name
());
}
#endif // WITH_ROCM
}
}
void
TraceKernelForwardDataContentEnd
(
KernelContext
*
kernel_ctx
,
const
Kernel
*
kernel
)
{
void
TraceKernelForwardDataContentEnd
(
KernelContext
*
kernel_ctx
,
const
Kernel
*
kernel
)
{
...
@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
...
@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
}
}
}
}
#endif // WITH_CUDA
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if
(
profile_kernel_forward_range
)
{
OF_PROFILER_RANGE_POP
();
}
// The memory bandwidth profiler only works in lazy mode.
if
(
profile_cuda_memory_bandwidth
)
{
auto
*
cuda_stream
=
dynamic_cast
<
ep
::
CudaStream
*>
(
kernel_ctx
->
stream
());
auto
*
actor_context_provider
=
dynamic_cast
<
ActorContextProvider
*>
(
kernel_ctx
);
if
(
cuda_stream
!=
nullptr
&&
actor_context_provider
!=
nullptr
)
{
hipEvent_t
start_event
=
cuda_memory_bandwidth_profile_start_event
;
hipEvent_t
end_event
=
cuda_memory_bandwidth_profile_end_event
;
cuda_memory_bandwidth_profile_start_event
=
nullptr
;
cuda_memory_bandwidth_profile_end_event
=
nullptr
;
CHECK_NOTNULL
(
start_event
);
CHECK_NOTNULL
(
end_event
);
OF_CUDA_CHECK
(
hipEventRecord
(
end_event
,
cuda_stream
->
cuda_stream
()));
int64_t
memory_size
=
0
;
for
(
const
auto
&
bn
:
kernel
->
op_attribute
().
input_bns
())
{
const
Blob
*
blob
=
kernel_ctx
->
BnInOp2Blob
(
bn
);
if
(
blob
)
{
memory_size
+=
blob
->
ByteSizeOfBlobBody
();
}
}
for
(
const
auto
&
bn
:
kernel
->
op_attribute
().
output_bns
())
{
const
Blob
*
blob
=
kernel_ctx
->
BnInOp2Blob
(
bn
);
if
(
blob
)
{
memory_size
+=
blob
->
ByteSizeOfBlobBody
();
}
}
const
std
::
string
op_name
=
kernel
->
op_conf
().
name
();
actor_context_provider
->
GetActorContext
()
->
AddCallback
(
[
start_event
,
end_event
,
memory_size
,
op_name
]()
{
float
elapsed_ms
=
0
;
OF_CUDA_CHECK
(
hipEventElapsedTime
(
&
elapsed_ms
,
start_event
,
end_event
));
OF_CUDA_CHECK
(
hipEventDestroy
(
start_event
));
OF_CUDA_CHECK
(
hipEventDestroy
(
end_event
));
double
bandwidth
=
static_cast
<
double
>
(
memory_size
)
/
(
1024.0
*
1024.0
*
1024.0
)
/
(
elapsed_ms
/
1000
);
LOG
(
INFO
)
<<
"PROFILER::KERNEL::CUDA_MEMORY_BANDWIDTH op_name: "
<<
op_name
<<
" elapsed(ms): "
<<
elapsed_ms
<<
" memory_size(Byte): "
<<
memory_size
<<
" bandwidth(GB/s): "
<<
bandwidth
;
});
}
}
#endif // WITH_ROCM
}
}
}
// namespace profiler
}
// namespace profiler
...
...
oneflow/core/profiler/kineto_shim.cpp
View file @
f262efc9
...
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
...
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.
*/
*/
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
#include "oneflow/core/profiler/kineto_shim.h"
#include "oneflow/core/profiler/kineto_shim.h"
#include "libkineto.h"
#include "libkineto.h"
...
...
oneflow/core/profiler/kineto_shim.h
View file @
f262efc9
...
@@ -16,7 +16,7 @@ limitations under the License.
...
@@ -16,7 +16,7 @@ limitations under the License.
#ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
#include <string>
#include <string>
#include <memory>
#include <memory>
...
...
oneflow/core/profiler/profile_manager.cpp
View file @
f262efc9
...
@@ -15,12 +15,12 @@ limitations under the License.
...
@@ -15,12 +15,12 @@ limitations under the License.
*/
*/
#include <memory>
#include <memory>
#include <unordered_map>
#include <unordered_map>
//
#include "fmt/core.h"
#include "fmt/core.h"
#include "nlohmann/json.hpp"
#include "nlohmann/json.hpp"
#include "oneflow/core/profiler/kineto_shim.h"
#include "oneflow/core/profiler/kineto_shim.h"
#include "oneflow/core/profiler/profile_manager.h"
#include "oneflow/core/profiler/profile_manager.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/event.h"
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
#include <libkineto.h>
#include <libkineto.h>
#endif // WITH_CUDA
#endif // WITH_CUDA
...
@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
...
@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
}
}
std
::
vector
<
std
::
shared_ptr
<
IEvent
>>
ProfileManager
::
ExportEvents
()
{
std
::
vector
<
std
::
shared_ptr
<
IEvent
>>
ProfileManager
::
ExportEvents
()
{
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
auto
trace
=
StopTrace
();
auto
trace
=
StopTrace
();
const
auto
&
kineto_events
=
*
(
trace
.
get
()
->
activities
());
const
auto
&
kineto_events
=
*
(
trace
.
get
()
->
activities
());
std
::
set
<
std
::
shared_ptr
<
IEvent
>>
custom_events
;
std
::
set
<
std
::
shared_ptr
<
IEvent
>>
custom_events
;
...
@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
...
@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
while
(
!
events_
.
empty
())
{
while
(
!
events_
.
empty
())
{
auto
evt
=
events_
.
front
();
auto
evt
=
events_
.
front
();
events_
.
pop
();
events_
.
pop
();
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
auto
evt_kernel
=
std
::
dynamic_pointer_cast
<
KernelEvent
>
(
evt
);
auto
evt_kernel
=
std
::
dynamic_pointer_cast
<
KernelEvent
>
(
evt
);
if
(
evt_kernel
)
{
if
(
evt_kernel
)
{
std
::
set
<
int64_t
>
current_corr_ids
;
std
::
set
<
int64_t
>
current_corr_ids
;
...
@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
...
@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
}
else
{
}
else
{
event_recorders_last_id_
[
name
]
++
;
event_recorders_last_id_
[
name
]
++
;
}
}
// return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
return
fmt
::
format
(
"{}.{}"
,
name
,
event_recorders_last_id_
[
name
]);
return
"yuguo"
;
}
}
}
// namespace profiler
}
// namespace profiler
...
...
oneflow/core/profiler/profile_manager.h
View file @
f262efc9
...
@@ -37,7 +37,7 @@ class ProfileManager {
...
@@ -37,7 +37,7 @@ class ProfileManager {
use_cuda_
(
use_cuda
),
use_cuda_
(
use_cuda
),
record_shapes_
(
record_shapes
),
record_shapes_
(
record_shapes
),
record_bandwidth_
(
record_bandwidth
)
{
record_bandwidth_
(
record_bandwidth
)
{
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
|| defined(WITH_ROCM)
std
::
set
<
ActivityType
>
activities
{};
std
::
set
<
ActivityType
>
activities
{};
if
(
use_cpu
)
{
activities
.
insert
(
ActivityType
::
CPU
);
}
if
(
use_cpu
)
{
activities
.
insert
(
ActivityType
::
CPU
);
}
if
(
use_cuda
)
{
activities
.
insert
(
ActivityType
::
CUDA
);
}
if
(
use_cuda
)
{
activities
.
insert
(
ActivityType
::
CUDA
);
}
...
...
oneflow/core/profiler/profiler.cpp
View file @
f262efc9
...
@@ -20,11 +20,20 @@ limitations under the License.
...
@@ -20,11 +20,20 @@ limitations under the License.
#include "oneflow/core/profiler/event_recorder.h"
#include "oneflow/core/profiler/event_recorder.h"
#include "oneflow/core/vm/vm_util.h"
#include "oneflow/core/vm/vm_util.h"
#ifdef OF_ENABLE_PROFILER
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_profile.h>
#include <roctracer_roctx.h>
#include <sys/syscall.h>
#include <iostream>
#include "oneflow/core/device/cuda_util.h"
#else
#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExt.h>
#include <sys/syscall.h>
#include <sys/syscall.h>
#include <iostream>
#include <iostream>
#include <cuda_profiler_api.h>
#include <cuda_profiler_api.h>
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/device/cuda_util.h"
#endif
#endif // OF_ENABLE_PROFILER
#endif // OF_ENABLE_PROFILER
namespace
oneflow
{
namespace
oneflow
{
...
@@ -33,6 +42,16 @@ namespace profiler {
...
@@ -33,6 +42,16 @@ namespace profiler {
void
NameThisHostThread
(
const
std
::
string
&
name
)
{
void
NameThisHostThread
(
const
std
::
string
&
name
)
{
#ifdef OF_ENABLE_PROFILER
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
static
thread_local
std
::
unique_ptr
<
std
::
string
>
thread_name_prefix
;
if
(
!
thread_name_prefix
)
{
thread_name_prefix
.
reset
(
new
std
::
string
(
GetStringFromEnv
(
"ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX"
,
""
)));
}
const
std
::
string
name_with_prefix
=
*
thread_name_prefix
+
name
;
// nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
roctxMarkA
(
name_with_prefix
.
c_str
());
#else
static
thread_local
std
::
unique_ptr
<
std
::
string
>
thread_name_prefix
;
static
thread_local
std
::
unique_ptr
<
std
::
string
>
thread_name_prefix
;
if
(
!
thread_name_prefix
)
{
if
(
!
thread_name_prefix
)
{
thread_name_prefix
.
reset
(
thread_name_prefix
.
reset
(
...
@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
...
@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
}
}
const
std
::
string
name_with_prefix
=
*
thread_name_prefix
+
name
;
const
std
::
string
name_with_prefix
=
*
thread_name_prefix
+
name
;
nvtxNameOsThreadA
(
syscall
(
SYS_gettid
),
name_with_prefix
.
c_str
());
nvtxNameOsThreadA
(
syscall
(
SYS_gettid
),
name_with_prefix
.
c_str
());
#endif
#endif // OF_ENABLE_PROFILER
#endif // OF_ENABLE_PROFILER
}
}
void
RangePush
(
const
std
::
string
&
name
)
{
void
RangePush
(
const
std
::
string
&
name
)
{
#ifdef OF_ENABLE_PROFILER
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePushA
(
name
.
c_str
());
#else
nvtxRangePushA
(
name
.
c_str
());
nvtxRangePushA
(
name
.
c_str
());
#endif
#endif // OF_ENABLE_PROFILER
#endif // OF_ENABLE_PROFILER
}
}
void
RangePop
()
{
void
RangePop
()
{
#ifdef OF_ENABLE_PROFILER
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePop
();
#else
nvtxRangePop
();
nvtxRangePop
();
#endif
#endif // OF_ENABLE_PROFILER
#endif // OF_ENABLE_PROFILER
}
}
...
@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {
...
@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {
void
ProfilerStart
()
{
void
ProfilerStart
()
{
#ifdef OF_ENABLE_PROFILER
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK
(
hipProfilerStart
());
#else
OF_CUDA_CHECK
(
cudaProfilerStart
());
OF_CUDA_CHECK
(
cudaProfilerStart
());
#endif
#endif // OF_ENABLE_PROFILER
#endif // OF_ENABLE_PROFILER
}
}
void
ProfilerStop
()
{
void
ProfilerStop
()
{
#ifdef OF_ENABLE_PROFILER
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK
(
hipProfilerStop
());
#else
OF_CUDA_CHECK
(
cudaProfilerStop
());
OF_CUDA_CHECK
(
cudaProfilerStop
());
#endif
#endif // OF_ENABLE_PROFILER
#endif // OF_ENABLE_PROFILER
}
}
...
@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
...
@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA)
OF_CUDA_CHECK
(
cudaDeviceSynchronize
());
OF_CUDA_CHECK
(
cudaDeviceSynchronize
());
#endif // WITH_CUDA
#endif // WITH_CUDA
#if defined(WITH_ROCM)
OF_CUDA_CHECK
(
hipDeviceSynchronize
());
#endif // WITH_ROCM
auto
*
pmgr
=
JUST
(
SingletonMaybe
<
ProfileManager
>
());
auto
*
pmgr
=
JUST
(
SingletonMaybe
<
ProfileManager
>
());
std
::
string
results
=
pmgr
->
DumpResultsJson
();
std
::
string
results
=
pmgr
->
DumpResultsJson
();
Singleton
<
ProfileManager
>::
Delete
();
Singleton
<
ProfileManager
>::
Delete
();
...
...
oneflow/user/kernels/math_unary_elementwise_func.h
View file @
f262efc9
This diff is collapsed.
Click to expand it.
oneflow/user/kernels/nvtx_range_kernel.hip.cpp
0 → 100644
View file @
f262efc9
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#ifdef OF_ENABLE_PROFILER
#include <roctracer_roctx.h>
#endif // OF_ENABLE_PROFILER
namespace
oneflow
{
namespace {

#ifdef OF_ENABLE_PROFILER
// Per-thread table of currently open roctx ranges, keyed by the mark string
// ("<mark_prefix>-<counter>"). NvtxStartKernel inserts the id returned by
// roctxRangeStartA and NvtxEndKernel looks it up and erases it. Because the
// table is thread_local, a range can only be closed on the thread that opened it.
thread_local HashMap<std::string, roctx_range_id_t> mark2range_id;
#endif  // OF_ENABLE_PROFILER

}  // namespace
// Kernel state shared by the nvtx_start / nvtx_end kernels: a counter of how
// many times the owning kernel has run, used to build a unique mark per call.
class NvtxOpKernelState final : public user_op::OpKernelState {
 public:
  NvtxOpKernelState() {
#ifndef OF_ENABLE_PROFILER
    // Without the profiler build flag the kernels do not emit any range.
    LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON";
#endif
  }
  ~NvtxOpKernelState() override = default;

  // Number of invocations recorded so far.
  int64_t counter() const { return counter_; }

  // Records one more invocation.
  void IncreaseCount() { ++counter_; }

 private:
  int64_t counter_ = 0;
};
// Kernel for the "nvtx_start" op: forwards `in` to `out` and, when the
// profiler is compiled in, opens a roctx range named "<mark_prefix>-<counter>".
class NvtxStartKernel final : public user_op::OpKernel {
 public:
  NvtxStartKernel() = default;
  ~NvtxStartKernel() override = default;

  // Each kernel instance gets its own invocation counter.
  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;

  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);

    // The op is an identity: output must mirror the input's shape and dtype.
    const ShapeView& in_shape = in->shape_view();
    CHECK_EQ(out->shape_view(), in_shape);
    const DataType in_data_type = in->data_type();
    CHECK_EQ(out->data_type(), in_data_type);

    const auto byte_size = in_shape.elem_cnt() * GetSizeOfDataType(in_data_type);
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(), byte_size);

#ifdef OF_ENABLE_PROFILER
    auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string mark =
        ctx->Attr<std::string>("mark_prefix") + "-" + std::to_string(kernel_state->counter());
    // Open the range and remember its id so the matching end kernel can close
    // it; a duplicate mark on this thread is treated as a fatal error.
    roctx_range_id_t range_id = roctxRangeStartA(mark.c_str());
    CHECK(mark2range_id.emplace(mark, range_id).second);
    kernel_state->IncreaseCount();
#endif  // OF_ENABLE_PROFILER
  }

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers NvtxStartKernel as the kernel for the "nvtx_start" user op when the
// device type is DeviceType::kCUDA, and proposes the ("out", 0) / ("in", 0)
// pair for in-place execution.
// NOTE(review): the trailing `false` passed to AddInplaceArgPairFn is presumably
// the "is mutable" flag - confirm against the AddInplaceArgPair signature.
REGISTER_USER_KERNEL
(
"nvtx_start"
)
.
SetCreateFn
<
NvtxStartKernel
>
()
.
SetIsMatchedHob
(
user_op
::
HobDeviceType
()
==
DeviceType
::
kCUDA
)
.
SetInplaceProposalFn
([](
const
user_op
::
InferContext
&
,
user_op
::
AddInplaceArgPair
AddInplaceArgPairFn
)
->
Maybe
<
void
>
{
OF_RETURN_IF_ERROR
(
AddInplaceArgPairFn
(
"out"
,
0
,
"in"
,
0
,
false
));
return
Maybe
<
void
>::
Ok
();
});
// Kernel for the "nvtx_end" op: forwards `in` to `out` and, when the profiler
// is compiled in, closes the roctx range named "<mark_prefix>-<counter>" that
// was previously recorded in mark2range_id on this thread.
class NvtxEndKernel final : public user_op::OpKernel {
 public:
  NvtxEndKernel() = default;
  ~NvtxEndKernel() override = default;

  // Each kernel instance gets its own invocation counter.
  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;

  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);

    // The op is an identity: output must mirror the input's shape and dtype.
    const ShapeView& in_shape = in->shape_view();
    CHECK_EQ(out->shape_view(), in_shape);
    const DataType in_data_type = in->data_type();
    CHECK_EQ(out->data_type(), in_data_type);

#ifdef OF_ENABLE_PROFILER
    auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
    const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
    // The range must have been opened under the same mark on this thread
    // (mark2range_id is thread_local); a missing entry is a fatal error.
    auto it = mark2range_id.find(mark);
    CHECK(it != mark2range_id.end());
    const roctx_range_id_t range_id = it->second;
    mark2range_id.erase(it);
    roctxRangeStop(range_id);
    kernel_state->IncreaseCount();
#endif  // OF_ENABLE_PROFILER

    // The data copy is part of the op's semantics, not of the profiling, so it
    // must run in every build. It was previously inside the profiler guard,
    // which left `out` unwritten when OF_ENABLE_PROFILER was not defined.
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
                              in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
  }

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers NvtxEndKernel as the kernel for the "nvtx_end" user op when the
// device type is DeviceType::kCUDA, and proposes the ("out", 0) / ("in", 0)
// pair for in-place execution.
// NOTE(review): the trailing `false` passed to AddInplaceArgPairFn is presumably
// the "is mutable" flag - confirm against the AddInplaceArgPair signature.
REGISTER_USER_KERNEL
(
"nvtx_end"
)
.
SetCreateFn
<
NvtxEndKernel
>
()
.
SetIsMatchedHob
(
user_op
::
HobDeviceType
()
==
DeviceType
::
kCUDA
)
.
SetInplaceProposalFn
([](
const
user_op
::
InferContext
&
,
user_op
::
AddInplaceArgPair
AddInplaceArgPairFn
)
->
Maybe
<
void
>
{
OF_RETURN_IF_ERROR
(
AddInplaceArgPairFn
(
"out"
,
0
,
"in"
,
0
,
false
));
return
Maybe
<
void
>::
Ok
();
});
}
// namespace oneflow
oneflow/user/kernels/stateful_opkernel.cpp
View file @
f262efc9
This diff is collapsed.
Click to expand it.
python/oneflow/test/modules/fused_dot_feature_interaction.py
0 → 100644
View file @
f262efc9
import
numpy
as
np
import
oneflow
as
flow
def fused_dot_feature_interaction(
    x,
    y,
    self_interaction=False,
    output_padding=0,
    output_concat=None,
    dtype=flow.float32,
):
    """Reference implementation of dot feature interaction built from basic ops.

    ``x`` is reshaped to ``(bs, 1, es)`` and stacked in front of ``y`` (whose
    shape is unpacked as ``(bs, dims, es)``) along dim 1. The stacked tensor is
    multiplied with its own transpose, and the lower-triangular entries of each
    per-sample product are gathered into a flat vector.

    Args:
        x: tensor reshapeable to ``(bs, 1, es)``.
        y: tensor of shape ``(bs, dims, es)``.
        self_interaction: when True the diagonal entries are included as well.
        output_padding: number of zero columns appended to the result
            (nothing is appended when 0).
        output_concat: optional tensor placed in front of the gathered
            interactions along dim 1.
        dtype: dtype the gathered interactions are cast to.

    Returns:
        The gathered interactions, optionally prefixed by ``output_concat`` and
        suffixed by zero padding.
    """
    bs, dims, es = y.shape
    offset = 1 if self_interaction else 0

    # (row, col) positions of the lower triangle; offset=1 also keeps the diagonal.
    index_pairs = [(i, j) for i in range(dims + 1) for j in range(i + offset)]
    li = flow.tensor([row for row, _ in index_pairs])
    lj = flow.tensor([col for _, col in index_pairs])

    T = flow.cat([flow.reshape(x, (bs, 1, es)), y], dim=1)
    # The gather below is done in float32 because half is not supported there.
    Z = flow.cast(flow.matmul(T, T, transpose_b=True), flow.float32)
    Zflat = flow.cast(Z[:, li, lj], dtype)

    R = Zflat if output_concat is None else flow.cat([output_concat, Zflat], dim=1)

    if output_padding != 0:
        padding_tensor = flow.tensor(
            np.zeros((bs, output_padding)).astype(np.float32),
            device="cuda",
            requires_grad=False,
        )
        R = flow.cat([R, padding_tensor], dim=1)
    return R
python/oneflow/test/profiler/test_profile_lenet.py
View file @
f262efc9
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment