Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
dfd0363f
Unverified
Commit
dfd0363f
authored
Mar 14, 2022
by
HELSON
Committed by
GitHub
Mar 14, 2022
Browse files
polished output format for communication profiler and pcie profiler (#404)
fixed typing error
parent
aaead33c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
74 additions
and
44 deletions
+74
-44
colossalai/utils/profiler/comm_profiler.py
colossalai/utils/profiler/comm_profiler.py
+30
-17
colossalai/utils/profiler/pcie_profiler.py
colossalai/utils/profiler/pcie_profiler.py
+43
-26
colossalai/utils/profiler/prof_utils.py
colossalai/utils/profiler/prof_utils.py
+1
-1
No files found.
colossalai/utils/profiler/comm_profiler.py
View file @
dfd0363f
...
...
@@ -6,20 +6,25 @@ from torch.autograd.profiler import profile
import
torch.distributed
as
dist
from
torch.distributed
import
ReduceOp
from
colossalai.utils
import
get_current_device
from
.prof_utils
import
BaseProfiler
,
_format_time
,
_format_memory
,
_format_bandwith
from
.prof_utils
import
BaseProfiler
,
_format_time
,
_format_memory
,
_format_bandwidth
from
typing
import
List
,
Optional
def
_get_code_location
(
depth
:
int
):
ret
=
""
length
=
len
(
inspect
.
stack
())
for
i
in
range
(
3
,
min
(
length
,
depth
+
1
)
):
ret
=
[]
length
=
min
(
len
(
inspect
.
stack
())
,
depth
+
1
)
for
i
in
range
(
3
,
length
):
upper_frame
=
inspect
.
stack
()[
i
]
function_name
=
inspect
.
stack
()[
i
-
1
].
function
info
=
upper_frame
.
filename
+
"("
+
str
(
upper_frame
.
lineno
)
+
"): "
+
function_name
+
"
\n
"
ret
+=
info
ret
.
append
(
upper_frame
.
filename
)
ret
.
append
(
'('
)
ret
.
append
(
str
(
upper_frame
.
lineno
))
ret
.
append
(
'): '
)
ret
.
append
(
function_name
)
if
i
!=
length
-
1
:
ret
.
append
(
'
\n
'
)
return
ret
return
''
.
join
(
ret
)
torch_all_reduce
=
dist
.
all_reduce
...
...
@@ -100,8 +105,9 @@ class CommProfiler(BaseProfiler):
def
result_list
(
self
,
sep
:
str
=
"
\n
"
):
res
=
[]
def
append
(
s
:
str
):
res
.
append
(
s
)
def
append
(
s
:
str
=
None
):
if
s
is
not
None
:
res
.
append
(
s
)
res
.
append
(
sep
)
if
self
.
warn_flag
:
...
...
@@ -110,19 +116,26 @@ class CommProfiler(BaseProfiler):
append
(
"Collective communication profiling result:"
)
append
(
"total cuda time: {}"
.
format
(
_format_time
(
self
.
total_cuda_time
)))
append
(
"average bandwith: {}"
.
format
(
_format_bandwith
(
self
.
total_comm_vol
,
self
.
total_cuda_time
)))
append
(
"average bandwidth: {}"
.
format
(
_format_bandwidth
(
self
.
total_comm_vol
,
self
.
total_cuda_time
)))
append
(
"total number of calls: {}"
.
format
(
self
.
total_count
))
append
(
"All events:
\n
----------------------------------------"
)
append
(
"All events:"
)
seperation
=
'-'
*
74
row_format
=
'{:^10}'
+
'{:^12}'
*
2
+
'{:^16}'
+
'{:^12}'
*
2
append
(
seperation
)
append
(
row_format
.
format
(
'Location'
,
'GPU time'
,
'Percentage'
,
'Comm volume'
,
'Bandwidth'
,
'Num of calls'
))
append
(
seperation
)
show_list
=
sorted
(
self
.
ops_record
.
items
(),
key
=
lambda
kv
:
-
kv
[
1
].
self_cuda_time
)
for
location
,
event
in
show_list
:
append
(
location
)
append
(
"self cuda time: {}"
.
format
(
_format_time
(
event
.
self_cuda_time
)))
append
(
"{:.1f}% of total communication time"
.
format
(
event
.
self_cuda_time
/
self
.
total_cuda_time
*
100.0
)
)
append
(
"self communication volme: {}"
.
format
(
_format_memory
(
event
.
self_comm_vol
)))
append
(
"average bandwith: {}"
.
format
(
_format_bandwith
(
event
.
self_comm_vol
,
event
.
self_cuda_time
)))
append
(
"number of calls: {}"
.
format
(
event
.
self_count
))
append
(
"----------------------------------------"
)
append
(
row_format
.
format
(
''
,
_format_time
(
event
.
self_cuda_time
)
,
'{:.1f}%'
.
format
(
event
.
self_cuda_time
/
self
.
total_cuda_time
*
100.0
),
_format_memory
(
event
.
self_comm_vol
),
_format_bandwidth
(
event
.
self_comm_vol
,
event
.
self_cuda_time
),
event
.
self_count
))
append
()
return
''
.
join
(
res
)
...
...
colossalai/utils/profiler/pcie_profiler.py
View file @
dfd0363f
from
pathlib
import
Path
from
torch.autograd.profiler
import
profile
from
.prof_utils
import
BaseProfiler
,
_format_time
,
_format_memory
,
_format_bandwith
from
.prof_utils
import
BaseProfiler
,
_format_time
,
_format_memory
,
_format_bandwidth
from
typing
import
List
...
...
@@ -24,6 +24,7 @@ def _reduce_location(locations: List[str]) -> str:
for
lo
in
locations
:
ret
.
append
(
lo
)
ret
.
append
(
"
\n
"
)
ret
=
ret
[:
-
1
]
return
''
.
join
(
ret
)
...
...
@@ -48,18 +49,23 @@ class PcieProfiler(BaseProfiler):
TODO: Merge pcie profiler into communication profiler
"""
def
__init__
(
self
,
dtype
:
str
=
"fp32"
,
depth
:
int
=
1
,
total_count
:
int
=
0
,
total_pcie_vol
:
int
=
0
,
total_cuda_time
:
int
=
0
):
def
__init__
(
self
,
dtype
:
str
=
"fp32"
,
depth
:
int
=
1
):
super
().
__init__
(
profiler_name
=
"Pcie"
,
priority
=
10
)
self
.
depth
=
depth
self
.
data_size
=
_get_size
(
dtype
)
self
.
total_count
=
total_count
self
.
total_pcie_vol
=
total_pcie_vol
self
.
total_cuda_time
=
total_cuda_time
self
.
h2d_count
=
0
self
.
h2d_time
=
0
self
.
d2h_count
=
0
self
.
d2h_time
=
0
self
.
ops_record
=
dict
()
self
.
profiler
=
None
def
reset
(
self
):
self
.
h2d_count
=
0
self
.
h2d_time
=
0
self
.
d2h_count
=
0
self
.
d2h_time
=
0
self
.
ops_record
=
dict
()
self
.
profiler
=
None
...
...
@@ -81,17 +87,20 @@ class PcieProfiler(BaseProfiler):
for
event
in
events
:
if
event
.
name
==
"aten::copy_"
:
t_shape
=
event
.
input_shapes
[
0
]
if
len
(
t_shape
)
==
0
or
event
.
cuda_time_total
==
0
:
if
len
(
t_shape
)
==
0
or
event
.
cuda_time_total
==
0
or
len
(
event
.
stack
)
==
0
:
continue
current_comm_event
=
PcieEvent
(
1
,
self
.
data_size
*
_get_numel
(
t_shape
),
event
.
cuda_time_total
)
self
.
total_count
+=
current_comm_event
.
count
self
.
total_pcie_vol
+=
current_comm_event
.
pcie_vol
self
.
total_cuda_time
+=
current_comm_event
.
cuda_time
code_location
=
_reduce_location
(
event
.
stack
[:
self
.
depth
])
if
code_location
in
self
.
ops_record
:
self
.
ops_record
[
code_location
].
add
(
current_comm_event
)
else
:
self
.
ops_record
[
code_location
]
=
current_comm_event
elif
'Memcpy HtoD'
in
event
.
name
:
self
.
h2d_count
+=
1
self
.
h2d_time
+=
event
.
cuda_time_total
elif
'Memcpy DtoH'
in
event
.
name
:
self
.
d2h_count
+=
1
self
.
d2h_time
+=
event
.
cuda_time_total
self
.
profiler
=
None
...
...
@@ -108,24 +117,32 @@ class PcieProfiler(BaseProfiler):
def
result_list
(
self
,
sep
:
str
=
"
\n
"
):
res
=
[]
def
append
(
s
:
str
):
res
.
append
(
s
)
def
append
(
s
:
str
=
None
):
if
s
is
not
None
:
res
.
append
(
s
)
res
.
append
(
sep
)
append
(
"Pcie profiling result:"
)
append
(
"total cuda time: {}"
.
format
(
_format_time
(
self
.
total_cuda_time
)))
append
(
"average bandwith: {}"
.
format
(
_format_bandwith
(
self
.
total_pcie_vol
,
self
.
total_cuda_time
)))
append
(
"total number of calls: {}"
.
format
(
self
.
total_count
))
append
(
"All events:
\n
----------------------------------------"
)
append
(
"time of data transmission (CPU -> GPU): {}"
.
format
(
_format_time
(
self
.
h2d_time
)))
append
(
"number of transmission (CPU -> GPU): {}"
.
format
(
self
.
h2d_count
))
append
(
"time of data transmission (GPU -> CPU): {}"
.
format
(
_format_time
(
self
.
d2h_time
)))
append
(
"number of transmission (GPU -> CPU): {}"
.
format
(
self
.
d2h_count
))
append
(
"Possible data transmission events in PCIE:"
)
seperation
=
'-'
*
62
row_format
=
'{:^10}'
+
'{:^12}'
+
'{:^16}'
+
'{:^12}'
*
2
append
(
seperation
)
append
(
row_format
.
format
(
'Location'
,
'GPU time'
,
'Trans volume'
,
'Bandwidth'
,
'Num of calls'
))
append
(
seperation
)
show_list
=
sorted
(
self
.
ops_record
.
items
(),
key
=
lambda
kv
:
-
kv
[
1
].
cuda_time
)
for
location
,
event
in
show_list
:
append
(
location
)
append
(
"cuda time: {}"
.
format
(
_format_time
(
event
.
cuda_time
)))
append
(
"{:.1f}% of total pcie time"
.
format
(
event
.
cuda_time
/
self
.
total_cuda_time
*
100.0
))
append
(
"pcie volme: {}"
.
format
(
_format_memory
(
event
.
pcie_vol
)))
append
(
"average bandwith: {}"
.
format
(
_format_bandwith
(
event
.
pcie_vol
,
event
.
cuda_time
)))
append
(
"number of calls: {}"
.
format
(
event
.
count
))
append
(
"----------------------------------------"
)
append
(
row_format
.
format
(
''
,
_format_time
(
event
.
cuda_time
),
_format_memory
(
event
.
pcie_vol
),
_format_bandwidth
(
event
.
pcie_vol
,
event
.
cuda_time
),
event
.
count
))
append
()
return
''
.
join
(
res
)
colossalai/utils/profiler/prof_utils.py
View file @
dfd0363f
...
...
@@ -32,7 +32,7 @@ def _format_memory(nbytes):
return
str
(
nbytes
)
+
' B'
def
_format_bandwith
(
volme
:
float
or
int
,
time_us
:
int
):
def
_format_bandwidth
(
volme
:
float
or
int
,
time_us
:
int
):
sec_div_mb
=
(
1000.0
/
1024.0
)
**
2
mb_per_sec
=
volme
/
time_us
*
sec_div_mb
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment