Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
aeb37c2a
Unverified
Commit
aeb37c2a
authored
Oct 03, 2024
by
Lucas Wilkinson
Committed by
GitHub
Oct 03, 2024
Browse files
[CI/Build] Per file CUDA Archs (improve wheel size and dev build times) (#8845)
parent
3dbb215b
Changes
22
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
320 additions
and
0 deletions
+320
-0
tools/report_build_time_ninja.py
tools/report_build_time_ninja.py
+311
-0
vllm/_custom_ops.py
vllm/_custom_ops.py
+9
-0
No files found.
tools/report_build_time_ninja.py
0 → 100644
View file @
aeb37c2a
#!/usr/bin/env python3
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Modified version of: https://chromium.googlesource.com/chromium/tools/depot_tools.git/+/refs/heads/main/post_build_ninja_summary.py
"""Summarize the last ninja build, invoked with ninja's -C syntax.
> python3 tools/report_build_time_ninja.py -C build/..
Typical output looks like this:
```
Longest build steps for .cpp.o:
1.0 weighted s to build ...torch_bindings.cpp.o (12.4 s elapsed time)
2.0 weighted s to build ..._attn_c.dir/csrc... (23.5 s elapsed time)
2.6 weighted s to build ...torch_bindings.cpp.o (31.5 s elapsed time)
3.2 weighted s to build ...torch_bindings.cpp.o (38.5 s elapsed time)
Longest build steps for .so (linking):
0.1 weighted s to build _core_C.abi3.so (0.7 s elapsed time)
0.1 weighted s to build _moe_C.abi3.so (1.0 s elapsed time)
0.5 weighted s to build ...flash_attn_c.abi3.so (1.1 s elapsed time)
6.2 weighted s to build _C.abi3.so (6.2 s elapsed time)
Longest build steps for .cu.o:
15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.5 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.6 s elapsed time)
15.3 weighted s to build ...machete_mm_... (183.7 s elapsed time)
15.5 weighted s to build ...machete_mm_... (185.6 s elapsed time)
15.5 weighted s to build ...machete_mm_... (185.9 s elapsed time)
15.5 weighted s to build ...machete_mm_... (186.2 s elapsed time)
37.4 weighted s to build ...scaled_mm_c3x.cu... (449.0 s elapsed time)
43.9 weighted s to build ...scaled_mm_c2x.cu... (527.4 s elapsed time)
344.8 weighted s to build ...attention_...cu.o (1087.2 s elapsed time)
1110.0 s weighted time (10120.4 s elapsed time sum, 9.1x parallelism)
134 build steps completed, average of 0.12/s
```
"""
import
argparse
import
errno
import
fnmatch
import
os
import
sys
from
collections
import
defaultdict
# The number of long build times to report:
long_count
=
10
# The number of long times by extension to report
long_ext_count
=
10
class
Target
:
"""Represents a single line read for a .ninja_log file."""
def
__init__
(
self
,
start
,
end
):
"""Creates a target object by passing in the start/end times in seconds
as a float."""
self
.
start
=
start
self
.
end
=
end
# A list of targets, appended to by the owner of this object.
self
.
targets
=
[]
self
.
weighted_duration
=
0.0
def
Duration
(
self
):
"""Returns the task duration in seconds as a float."""
return
self
.
end
-
self
.
start
def
SetWeightedDuration
(
self
,
weighted_duration
):
"""Sets the duration, in seconds, passed in as a float."""
self
.
weighted_duration
=
weighted_duration
def
WeightedDuration
(
self
):
"""Returns the task's weighted duration in seconds as a float.
Weighted_duration takes the elapsed time of the task and divides it
by how many other tasks were running at the same time. Thus, it
represents the approximate impact of this task on the total build time,
with serialized or serializing steps typically ending up with much
longer weighted durations.
weighted_duration should always be the same or shorter than duration.
"""
# Allow for modest floating-point errors
epsilon
=
0.000002
if
(
self
.
weighted_duration
>
self
.
Duration
()
+
epsilon
):
print
(
'%s > %s?'
%
(
self
.
weighted_duration
,
self
.
Duration
()))
assert
(
self
.
weighted_duration
<=
self
.
Duration
()
+
epsilon
)
return
self
.
weighted_duration
def
DescribeTargets
(
self
):
"""Returns a printable string that summarizes the targets."""
# Some build steps generate dozens of outputs - handle them sanely.
# The max_length was chosen so that it can fit most of the long
# single-target names, while minimizing word wrapping.
result
=
', '
.
join
(
self
.
targets
)
max_length
=
65
if
len
(
result
)
>
max_length
:
result
=
result
[:
max_length
]
+
'...'
return
result
# Copied with some modifications from ninjatracing
def
ReadTargets
(
log
,
show_all
):
"""Reads all targets from .ninja_log file |log_file|, sorted by duration.
The result is a list of Target objects."""
header
=
log
.
readline
()
assert
header
==
'# ninja log v5
\n
'
,
\
'unrecognized ninja log version %r'
%
header
targets_dict
=
{}
last_end_seen
=
0.0
for
line
in
log
:
parts
=
line
.
strip
().
split
(
'
\t
'
)
if
len
(
parts
)
!=
5
:
# If ninja.exe is rudely halted then the .ninja_log file may be
# corrupt. Silently continue.
continue
start
,
end
,
_
,
name
,
cmdhash
=
parts
# Ignore restat.
# Convert from integral milliseconds to float seconds.
start
=
int
(
start
)
/
1000.0
end
=
int
(
end
)
/
1000.0
if
not
show_all
and
end
<
last_end_seen
:
# An earlier time stamp means that this step is the first in a new
# build, possibly an incremental build. Throw away the previous
# data so that this new build will be displayed independently.
# This has to be done by comparing end times because records are
# written to the .ninja_log file when commands complete, so end
# times are guaranteed to be in order, but start times are not.
targets_dict
=
{}
target
=
None
if
cmdhash
in
targets_dict
:
target
=
targets_dict
[
cmdhash
]
if
not
show_all
and
(
target
.
start
!=
start
or
target
.
end
!=
end
):
# If several builds in a row just run one or two build steps
# then the end times may not go backwards so the last build may
# not be detected as such. However in many cases there will be a
# build step repeated in the two builds and the changed
# start/stop points for that command, identified by the hash,
# can be used to detect and reset the target dictionary.
targets_dict
=
{}
target
=
None
if
not
target
:
targets_dict
[
cmdhash
]
=
target
=
Target
(
start
,
end
)
last_end_seen
=
end
target
.
targets
.
append
(
name
)
return
list
(
targets_dict
.
values
())
def
GetExtension
(
target
,
extra_patterns
):
"""Return the file extension that best represents a target.
For targets that generate multiple outputs it is important to return a
consistent 'canonical' extension. Ultimately the goal is to group build steps
by type."""
for
output
in
target
.
targets
:
if
extra_patterns
:
for
fn_pattern
in
extra_patterns
.
split
(
';'
):
if
fnmatch
.
fnmatch
(
output
,
'*'
+
fn_pattern
+
'*'
):
return
fn_pattern
# Not a true extension, but a good grouping.
if
output
.
endswith
(
'type_mappings'
):
extension
=
'type_mappings'
break
# Capture two extensions if present. For example: file.javac.jar should
# be distinguished from file.interface.jar.
root
,
ext1
=
os
.
path
.
splitext
(
output
)
_
,
ext2
=
os
.
path
.
splitext
(
root
)
extension
=
ext2
+
ext1
# Preserve the order in the file name.
if
len
(
extension
)
==
0
:
extension
=
'(no extension found)'
if
ext1
in
[
'.pdb'
,
'.dll'
,
'.exe'
]:
extension
=
'PEFile (linking)'
# Make sure that .dll and .exe are grouped together and that the
# .dll.lib files don't cause these to be listed as libraries
break
if
ext1
in
[
'.so'
,
'.TOC'
]:
extension
=
'.so (linking)'
# Attempt to identify linking, avoid identifying as '.TOC'
break
# Make sure .obj files don't get categorized as mojo files
if
ext1
in
[
'.obj'
,
'.o'
]:
break
# Jars are the canonical output of java targets.
if
ext1
==
'.jar'
:
break
# Normalize all mojo related outputs to 'mojo'.
if
output
.
count
(
'.mojom'
)
>
0
:
extension
=
'mojo'
break
return
extension
def
SummarizeEntries
(
entries
,
extra_step_types
):
"""Print a summary of the passed in list of Target objects."""
# Create a list that is in order by time stamp and has entries for the
# beginning and ending of each build step (one time stamp may have multiple
# entries due to multiple steps starting/stopping at exactly the same time).
# Iterate through this list, keeping track of which tasks are running at all
# times. At each time step calculate a running total for weighted time so
# that when each task ends its own weighted time can easily be calculated.
task_start_stop_times
=
[]
earliest
=
-
1
latest
=
0
total_cpu_time
=
0
for
target
in
entries
:
if
earliest
<
0
or
target
.
start
<
earliest
:
earliest
=
target
.
start
if
target
.
end
>
latest
:
latest
=
target
.
end
total_cpu_time
+=
target
.
Duration
()
task_start_stop_times
.
append
((
target
.
start
,
'start'
,
target
))
task_start_stop_times
.
append
((
target
.
end
,
'stop'
,
target
))
length
=
latest
-
earliest
weighted_total
=
0.0
# Sort by the time/type records and ignore |target|
task_start_stop_times
.
sort
(
key
=
lambda
times
:
times
[:
2
])
# Now we have all task start/stop times sorted by when they happen. If a
# task starts and stops on the same time stamp then the start will come
# first because of the alphabet, which is important for making this work
# correctly.
# Track the tasks which are currently running.
running_tasks
=
{}
# Record the time we have processed up to so we know how to calculate time
# deltas.
last_time
=
task_start_stop_times
[
0
][
0
]
# Track the accumulated weighted time so that it can efficiently be added
# to individual tasks.
last_weighted_time
=
0.0
# Scan all start/stop events.
for
event
in
task_start_stop_times
:
time
,
action_name
,
target
=
event
# Accumulate weighted time up to now.
num_running
=
len
(
running_tasks
)
if
num_running
>
0
:
# Update the total weighted time up to this moment.
last_weighted_time
+=
(
time
-
last_time
)
/
float
(
num_running
)
if
action_name
==
'start'
:
# Record the total weighted task time when this task starts.
running_tasks
[
target
]
=
last_weighted_time
if
action_name
==
'stop'
:
# Record the change in the total weighted task time while this task
# ran.
weighted_duration
=
last_weighted_time
-
running_tasks
[
target
]
target
.
SetWeightedDuration
(
weighted_duration
)
weighted_total
+=
weighted_duration
del
running_tasks
[
target
]
last_time
=
time
assert
(
len
(
running_tasks
)
==
0
)
# Warn if the sum of weighted times is off by more than half a second.
if
abs
(
length
-
weighted_total
)
>
500
:
print
(
'Warning: Possible corrupt ninja log, results may be '
'untrustworthy. Length = %.3f, weighted total = %.3f'
%
(
length
,
weighted_total
))
entries_by_ext
=
defaultdict
(
list
)
for
target
in
entries
:
extension
=
GetExtension
(
target
,
extra_step_types
)
entries_by_ext
[
extension
].
append
(
target
)
for
key
,
values
in
entries_by_ext
.
items
():
print
(
' Longest build steps for %s:'
%
key
)
values
.
sort
(
key
=
lambda
x
:
x
.
WeightedDuration
())
for
target
in
values
[
-
long_count
:]:
print
(
' %8.1f weighted s to build %s (%.1f s elapsed time)'
%
(
target
.
WeightedDuration
(),
target
.
DescribeTargets
(),
target
.
Duration
()))
print
(
' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
'parallelism)'
%
(
length
,
total_cpu_time
,
total_cpu_time
*
1.0
/
length
))
print
(
' %d build steps completed, average of %1.2f/s'
%
(
len
(
entries
),
len
(
entries
)
/
(
length
)))
def
main
():
log_file
=
'.ninja_log'
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'-C'
,
dest
=
'build_directory'
,
help
=
'Build directory.'
)
parser
.
add_argument
(
'-s'
,
'--step-types'
,
help
=
'semicolon separated fnmatch patterns for build-step grouping'
)
parser
.
add_argument
(
'--log-file'
,
help
=
"specific ninja log file to analyze."
)
args
,
_extra_args
=
parser
.
parse_known_args
()
if
args
.
build_directory
:
log_file
=
os
.
path
.
join
(
args
.
build_directory
,
log_file
)
if
args
.
log_file
:
log_file
=
args
.
log_file
if
args
.
step_types
:
# Make room for the extra build types.
global
long_ext_count
long_ext_count
+=
len
(
args
.
step_types
.
split
(
';'
))
try
:
with
open
(
log_file
,
'r'
)
as
log
:
entries
=
ReadTargets
(
log
,
False
)
SummarizeEntries
(
entries
,
args
.
step_types
)
except
IOError
:
print
(
'Log file %r not found, no build summary created.'
%
log_file
)
return
errno
.
ENOENT
if
__name__
==
'__main__'
:
sys
.
exit
(
main
())
vllm/_custom_ops.py
View file @
aeb37c2a
...
@@ -32,6 +32,15 @@ def hint_on_error(fn):
...
@@ -32,6 +32,15 @@ def hint_on_error(fn):
def
wrapper
(
*
args
,
**
kwargs
):
def
wrapper
(
*
args
,
**
kwargs
):
try
:
try
:
return
fn
(
*
args
,
**
kwargs
)
return
fn
(
*
args
,
**
kwargs
)
except
NotImplementedError
as
e
:
msg
=
(
"Error in calling custom op %s: %s
\n
"
"Not implemented or built, mostly likely because the current current device "
"does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
"incorrectly while building)"
)
logger
.
error
(
msg
,
fn
.
__name__
,
e
)
raise
NotImplementedError
(
msg
%
(
fn
.
__name__
,
e
))
from
e
except
AttributeError
as
e
:
except
AttributeError
as
e
:
msg
=
(
msg
=
(
"Error in calling custom op %s: %s
\n
"
"Error in calling custom op %s: %s
\n
"
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment