OpenDAS / deepspeed / Commits

Commit 7d1a83a9, authored May 25, 2022 by aiss

push Deepspeed 0.6.3 rocm version

Parent: ab5534fc
Changes: 742 changed files in this commit. Showing 20 changed files with 1789 additions and 47 deletions (+1789, -47).
csrc/aio/py_lib/deepspeed_py_aio_handle.h      +68   -0
csrc/aio/py_lib/deepspeed_py_copy.cpp          +133  -0
csrc/aio/py_lib/deepspeed_py_copy.h            +42   -0
csrc/aio/py_lib/py_ds_aio.cpp                  +41   -0
csrc/aio/py_test/aio_bench_generate_param.py   +96   -0
csrc/aio/py_test/aio_bench_perf_sweep.py       +397  -0
csrc/aio/py_test/ds_aio_basic.py               +144  -0
csrc/aio/py_test/ds_aio_handle.py              +176  -0
csrc/aio/py_test/parse_aio_stats.py            +154  -0
csrc/aio/py_test/perf_sweep_utils.py           +8    -0
csrc/aio/py_test/run_read_sweep.sh             +75   -0
csrc/aio/py_test/run_write_sweep.sh            +82   -0
csrc/aio/py_test/single_process_config.json    +29   -0
csrc/aio/py_test/test_ds_aio.py                +101  -0
csrc/aio/py_test/test_ds_aio_utils.py          +59   -0
csrc/aio/py_test/validate_async_io.py          +9    -0
csrc/common/custom_cuda_kernel.cu              +39   -0
csrc/common/custom_hip_kernel.hip              +41   -0
csrc/includes/Timer.h                          +47   -47
csrc/includes/Timer_hip.h                      +48   -0
csrc/aio/py_lib/deepspeed_py_aio_handle.h (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#include <condition_variable>
#include <memory>
#include "deepspeed_aio_thread.h"

struct deepspeed_aio_handle_t {
    std::unique_ptr<struct aio_context> _aio_ctxt;
    const bool _single_submit;
    const bool _overlap_events;
    const int _num_threads;
    deepspeed_aio_config_t _aio_config;

    std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
    std::vector<std::thread> _threads;
    int _num_pending_ops;

    deepspeed_aio_handle_t(const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const int num_threads);

    ~deepspeed_aio_handle_t();

    const int get_block_size() const;
    const int get_queue_depth() const;
    const bool get_single_submit() const;
    const bool get_overlap_events() const;
    const int get_thread_count() const;

    int read(torch::Tensor& buffer, const char* filename, const bool validate);

    int write(const torch::Tensor& buffer, const char* filename, const bool validate);

    int pread(const torch::Tensor& buffer,
              const char* filename,
              const bool validate,
              const bool async);

    int pwrite(const torch::Tensor& buffer,
               const char* filename,
               const bool validate,
               const bool async);

    int sync_pread(torch::Tensor& buffer, const char* filename);

    int sync_pwrite(const torch::Tensor& buffer, const char* filename);

    int async_pread(torch::Tensor& buffer, const char* filename);

    int async_pwrite(const torch::Tensor& buffer, const char* filename);

    int wait();

    void _stop_threads();

    void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);

    std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();

    bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
};
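The struct above is the native side of the aio_handle object bound to Python in py_ds_aio.cpp below. A minimal usage sketch, not part of this commit, assuming the extension builds on the target machine and using an illustrative file path; it mirrors the pattern used by ds_aio_handle.py later in this diff:

import torch
from deepspeed.ops.aio import AsyncIOBuilder

aio = AsyncIOBuilder().load()
# block_size, queue_depth, single_submit, overlap_events, num_threads
handle = aio.aio_handle(1048576, 32, False, True, 1)
buffer = torch.empty(1048576, dtype=torch.uint8, device='cpu').pin_memory()
ret = handle.pread(buffer, '/tmp/input.bin', False, True)  # validate=False, async=True
assert ret != -1
handle.wait()  # block until the queued read completes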
csrc/aio/py_lib/deepspeed_py_copy.cpp (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#include "deepspeed_py_copy.h"
#include <omp.h>

#define ROUND_DOWN(size, step) ((size) & ~((step)-1))

#if defined(__AVX512__) or defined(__AVX256__)
union AVX_Data {
#if defined(__AVX512__)
    __m512 data;
#else
    __m256 data;
#endif
};
#endif

static void helper_memcpy_1(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH);

    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH) {
            AVX_Data src_4;
            src_4.data = SIMD_LOAD(src + i);
            SIMD_STORE(dest + i, src_4.data);
        }
    }
#endif

    if (param_size > rounded_size) {
#pragma omp parallel for
        for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; }
    }
}

static void helper_memcpy_4(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));

    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
            AVX_Data src_4[4];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
        }
    }
#endif

    if (param_size > rounded_size)
        helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}

static void helper_mempcy_8(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));

    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
            AVX_Data src_4[8];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);
            src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2));
            src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5);
            src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6);
            src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data);
        }
    }
#endif

    if (param_size > rounded_size)
        helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}

int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src)
{
    auto dest_c = dest.contiguous();
    auto src_c = src.contiguous();

    float* dest_ptr = (float*)dest_c.data_ptr();
    float* src_ptr = (float*)src_c.data_ptr();

    helper_mempcy_8(dest_ptr, src_ptr, dest_c.size(0));

    return 0;
}
csrc/aio/py_lib/deepspeed_py_copy.h (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif

#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>

#define TILE (1024 * 1024 * 1024)

#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif

int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src);
csrc/aio/py_lib/py_ds_aio.cpp (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");

    m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");

    m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");

    py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
        .def(py::init<const int, const int, const bool, const bool, const int>())
        .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
        .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
        .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
        .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
        .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
        .def("read", &deepspeed_aio_handle_t::read)
        .def("write", &deepspeed_aio_handle_t::write)
        .def("pread", &deepspeed_aio_handle_t::pread)
        .def("pwrite", &deepspeed_aio_handle_t::pwrite)
        .def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
        .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
        .def("async_pread", &deepspeed_aio_handle_t::async_pread)
        .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
        .def("wait", &deepspeed_aio_handle_t::wait);
}
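For the flat (non-handle) bindings, a short sketch of how ds_aio_basic.py below drives them; not part of the commit, with illustrative paths and sizes. Note that deepspeed_py_memcpy casts data_ptr() to float* and copies size(0) elements, so it expects 1-D float32 tensors:

import torch
from deepspeed.ops.aio import AsyncIOBuilder

ds_aio = AsyncIOBuilder().load()

# aio_read(buffer, filename, block_size, queue_depth, single_submit, overlap_events, validate)
buffer = torch.empty(1048576, dtype=torch.uint8, device='cpu').pin_memory()
ds_aio.aio_read(buffer, '/tmp/input.bin', 1048576, 32, False, True, False)

# deepspeed_memcpy(dest, src) on 1-D float32 tensors
src = torch.rand(1024, dtype=torch.float32)
dst = torch.empty_like(src)
ds_aio.deepspeed_memcpy(dst, src)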
csrc/aio/py_test/aio_bench_generate_param.py (new file, mode 100644)

"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import argparse
import json
from parse_aio_stats import READ_SPEED, WRITE_SPEED, get_sorted_results
from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--log_dir',
        type=str,
        default=BENCH_LOG_DIR,
        help=f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}')

    args = parser.parse_args()
    print(f'args = {args}')

    return args


def validate_args(args):
    for d in [READ_LOG_DIR, WRITE_LOG_DIR]:
        log_dir = os.path.join(args.log_dir, d)
        if not os.path.isdir(log_dir):
            print(f'{log_dir} folder does not exist')
            return False

    return True


def convert_to_param(key):
    assert len(key) == 6
    return {
        "single_submit": "true" if key[0] == "single" else "false",
        "overlap_events": "true" if key[1] == "overlap" else "false",
        "thread_count": int(key[3]),
        "queue_depth": int(key[4]),
        "block_size": int(key[5])
    }


def generate_aio_param(read_log_dir, write_log_dir):
    _, read_results = get_sorted_results(read_log_dir, READ_SPEED)
    _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED)
    combined_perf = {key[1:]: value for key, value in read_results.items()}

    for key, value in write_results.items():
        new_key = key[1:]
        if new_key in combined_perf:
            combined_perf[new_key] += value
        else:
            combined_perf[new_key] = 0

    optimal_key = None
    optimal_perf = 0.0
    for key, value in combined_perf.items():
        if value > optimal_perf:
            optimal_perf = value
            optimal_key = key

    aio_param = {"aio": convert_to_param(optimal_key)}

    read_perf_keys = {key[1:]: key for key in read_results.keys()}
    write_perf_keys = {key[1:]: key for key in write_results.keys()}
    optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
    optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)

    print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}')
    print(json.dumps(aio_param, indent=3))


def main():
    print('Generate aio param')
    args = parse_arguments()
    if not validate_args(args):
        quit()

    read_log_dir = os.path.join(args.log_dir, READ_LOG_DIR)
    write_log_dir = os.path.join(args.log_dir, WRITE_LOG_DIR)
    generate_aio_param(read_log_dir, write_log_dir)


if __name__ == "__main__":
    main()
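For reference, the JSON block printed at the end has the shape produced by convert_to_param above; the values below are illustrative only, not measured output:

aio_param = {
    "aio": {
        "single_submit": "false",
        "overlap_events": "true",
        "thread_count": 8,
        "queue_depth": 32,
        "block_size": 262144
    }
}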
csrc/aio/py_test/aio_bench_perf_sweep.py (new file, mode 100644)

"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import sys
import argparse
import json
import itertools
import subprocess
import shutil

from test_ds_aio_utils import refine_integer_value
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
    READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR

OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG = {
    "block_size": ["128K", "256K"],
    "queue_depth": [4, 16, 32],
    "overlap_events": [True, False],
    "io_parallel": [2, 8],
    "single_submit": [False]
}


class Job(object):
    def __init__(self, cmd_line, output_file=None, work_dir=None):
        self.cmd_line = cmd_line
        self.output_file = output_file
        self.work_dir = work_dir
        self.output_fd = None

    def cmd(self):
        return self.cmd_line

    def get_stdout(self):
        return self.output_fd

    def get_stderr(self):
        return self.output_fd

    def get_cwd(self):
        return self.work_dir

    def open_output_file(self):
        if self.output_file is not None:
            self.output_fd = open(self.output_file, 'w')

    def close_output_file(self):
        if self.output_fd is not None:
            self.output_fd.close()
            self.output_fd = None


class SweepConfig(object):
    def __init__(self, args):
        self.nvme_dir = args.nvme_dir
        self.io_size = args.io_size
        self.search_space = get_sweep_config_dict(args.sweep_config)
        self.read = not args.no_read
        self.write = not args.no_write
        self.flush_cache = not args.no_sudo
        self.log_dir = args.log_dir
        self.loops = args.loops
        self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}'


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--nvme_dir',
        required=True,
        type=str,
        help='Directory in which to perform I/O tests. A writeable directory on an NVMe device.')

    parser.add_argument('--sweep_config',
                        type=str,
                        default=None,
                        help='Performance sweep configuration json file.')

    parser.add_argument('--no_read',
                        action='store_true',
                        help='Disable read performance measurements.')

    parser.add_argument('--no_write',
                        action='store_true',
                        help='Disable write performance measurements.')

    parser.add_argument(
        '--io_size',
        type=str,
        default="400M",
        help='Number of I/O bytes to read/write for performance measurements.')

    parser.add_argument(
        '--no_sudo',
        action='store_true',
        help='Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.')

    parser.add_argument(
        '--log_dir',
        type=str,
        default=BENCH_LOG_DIR,
        help=f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}')

    parser.add_argument('--loops',
                        type=int,
                        default=1,
                        help='Count of operation repetitions')

    args = parser.parse_args()
    print(f'args = {args}')

    return args


def dump_cmd_lines(cmd_lines):
    print(f'cmd line count = {len(cmd_lines)}')
    for i, cmd in enumerate(cmd_lines):
        print(f'{i}: {cmd}')


def get_sweep_config_dict(sweep_config_json):
    if sweep_config_json is None:
        return DEFAULT_SWEEP_CONFIG

    with open(sweep_config_json) as fp:
        sweep_config = json.load(fp)
    return sweep_config


def get_sweep_cmd_lines(sweep_config_dict):
    def flatten_options(key, value_list):
        flat_list = []
        for v in value_list:
            if not type(v) is bool:
                flat_list.append(f'--{key} {v}')
            elif v:
                flat_list.append(f'--{key}')
            else:
                flat_list.append(' ')

        return flat_list

    flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()]
    cmd_list = list(itertools.product(*flat_list))
    cmd_list = [list(cmd) for cmd in cmd_list]
    #dump_cmd_lines(cmd_list)
    return cmd_list


def run_job(job):
    args = ' '.join(job.cmd())
    print(f'args = {args}')
    job.open_output_file()
    proc = subprocess.run(args=args,
                          shell=True,
                          stdout=job.get_stdout(),
                          stderr=job.get_stderr(),
                          cwd=job.get_cwd())
    job.close_output_file()
    assert proc.returncode == 0, \
        f"This command failed: {job.cmd()}"


def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
    for perf_job in sweep_jobs:
        if flush_cache_job is not None:
            run_job(sync_job)
            run_job(flush_cache_job)

        run_job(perf_job)

        run_job(sync_job)


def create_cmd_tags(cmd_line):
    tags = {}
    for param_value in cmd_line:
        fields = param_value.split()
        if len(fields) == 1:
            tags[fields[0]] = None
        elif len(fields) == 2:
            tags[fields[0]] = fields[1]
    return tags


def get_log_file(io_op_desc, cmd_line):
    QUEUE_DEPTH = "--queue_depth"
    BLOCK_SIZE = "--block_size"
    SINGLE_SUBMIT = "--single_submit"
    OVERLAP_EVENTS = "--overlap_events"
    THREAD_COUNT = "--threads"
    IO_PARALLEL = "--io_parallel"

    tag_map = {
        QUEUE_DEPTH: "d",
        BLOCK_SIZE: "bs",
        SINGLE_SUBMIT: "single",
        OVERLAP_EVENTS: "overlap",
        THREAD_COUNT: "t",
        IO_PARALLEL: "p"
    }

    tag_default = {
        QUEUE_DEPTH: 1,
        BLOCK_SIZE: "1M",
        SINGLE_SUBMIT: "block",
        OVERLAP_EVENTS: "sequential",
        THREAD_COUNT: 1,
        IO_PARALLEL: 1
    }

    def get_default_value(tag):
        value = tag_default[tag]
        if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]:
            return value
        return f'{tag_map[tag]}{value}'

    def get_config_value(tag, value):
        tag_key = tag_map[tag]
        if value is None:
            return tag_key
        return f'{tag_key}{value}'

    tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
    log_tags = [io_op_desc]
    cmd_tags = create_cmd_tags(cmd_line)
    for tag in tag_list:
        if tag in cmd_tags:
            log_tags.append(get_config_value(tag, cmd_tags[tag]))
        else:
            log_tags.append(get_default_value(tag))

    log_file = '_'.join(log_tags)
    log_file += '.txt'

    return log_file


def create_perf_jobs(io_op_desc, log_dir, cmd_lines):
    py_cmd = ['python', os.path.join(script_path(), PERF_SCRIPT)]

    perf_jobs = []
    for cmd in cmd_lines:
        log_file = os.path.join(log_dir, get_log_file(io_op_desc, cmd))
        job = Job(cmd_line=py_cmd + cmd, output_file=log_file)
        perf_jobs.append(job)

    return perf_jobs


def script_path():
    return os.path.dirname(os.path.realpath(sys.argv[0]))


def async_io_setup():
    import deepspeed
    from deepspeed.ops.aio import AsyncIOBuilder
    return AsyncIOBuilder().is_compatible()


def get_block_size_and_count(io_bytes):
    block_size = 1
    block_count = io_bytes
    bytes_in_KB = 1024

    while block_count % bytes_in_KB == 0:
        block_size *= bytes_in_KB
        block_count /= bytes_in_KB

    return int(block_size), int(block_count)


def create_read_file(sweep_config):
    read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}')
    os.makedirs(read_folder, exist_ok=True)
    read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
    block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
    dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
    print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    run_job(dd_job)
    print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    return read_folder, read_file_name


def remove_folder(folder):
    assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found"
    shutil.rmtree(folder)


def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    read_folder, read_file_name = create_read_file(sweep_config)
    read_option = f'--read_file {read_file_name}'
    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(read_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC,
                                 log_dir=log_folder,
                                 cmd_lines=read_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs,
                 sync_job=sync_job,
                 flush_cache_job=flush_cache_job)

    remove_folder(read_folder)


def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}')
    os.makedirs(write_folder, exist_ok=True)
    write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
    write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(write_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC,
                                 log_dir=log_folder,
                                 cmd_lines=write_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs,
                 sync_job=sync_job,
                 flush_cache_job=flush_cache_job)

    remove_folder(write_folder)


def main():
    print("Running performance sweep of deepspeed nvme library")

    if not async_io_setup():
        error_msg = """
            Failing because environment is not properly configured for deepspeed async i/o module.
            Possible fix: apt install libaio-dev.
        """
        print(error_msg)
        quit()

    args = parse_arguments()
    sweep_config = SweepConfig(args)
    cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)

    if sweep_config.flush_cache:
        flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"])
    else:
        flush_cache_job = None

    sync_job = Job(cmd_line=['sync'])

    if sweep_config.read:
        run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)

    if sweep_config.write:
        run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)


if __name__ == "__main__":
    main()
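A worked example of how get_block_size_and_count above turns the default --io_size of "400M" into a dd invocation; this is a self-contained restatement of the loop, not new behavior:

def block_size_and_count(io_bytes, bytes_in_kb=1024):
    # Same factoring loop as get_block_size_and_count: pull out factors of 1024.
    block_size, block_count = 1, io_bytes
    while block_count % bytes_in_kb == 0:
        block_size *= bytes_in_kb
        block_count //= bytes_in_kb
    return block_size, block_count

# 400M = 400 * 1024**2 = 419430400 bytes
print(block_size_and_count(400 * 1024**2))  # (1048576, 400)
# i.e. create_read_file runs: dd if=/dev/urandom of=<read_file> bs=1048576 count=400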
csrc/aio/py_test/ds_aio_basic.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier


def pre_basic(args, tid, read_op):
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0

    return ctxt


def pre_basic_read(pool_params):
    args, tid = pool_params
    ctxt = pre_basic(args, tid, True)
    return ctxt


def pre_basic_write(pool_params):
    args, tid = pool_params
    ctxt = pre_basic(args, tid, False)
    return ctxt


def post_basic(pool_params):
    _, _, ctxt = pool_params
    ctxt["buffer"].detach()
    ctxt["buffer"] = None

    return ctxt


def main_basic_read(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
                                     ctxt['file'],
                                     args.block_size,
                                     args.queue_depth,
                                     args.single_submit,
                                     args.overlap_events,
                                     args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_basic_write(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
                                      ctxt['file'],
                                      args.block_size,
                                      args.queue_depth,
                                      args.single_submit,
                                      args.overlap_events,
                                      args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def get_schedule(args, read_op):
    schedule = {}
    if read_op:
        schedule['pre'] = pre_basic_read
        schedule['post'] = post_basic
        schedule['main'] = main_basic_read
    else:
        schedule['pre'] = pre_basic_write
        schedule['post'] = post_basic
        schedule['main'] = main_basic_write

    return schedule


def _aio_handle_tasklet(pool_params):
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


def _init_tasklet(b):
    global aio_barrier
    aio_barrier = b


def aio_basic_multiprocessing(args, read_op):
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
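The Pool-plus-Barrier pattern above, in which every worker reaches the barrier before and after each timed phase so per-phase timings line up across processes, reduces to the following self-contained toy (stand-in work function, not the real tasklet):

from multiprocessing import Pool, Barrier

def _init(b):
    global barrier
    barrier = b

def _tasklet(tid):
    barrier.wait()   # all workers enter the timed phase together
    return tid * 2   # stand-in for (task_sec, elapsed_sec, num_bytes)

if __name__ == '__main__':
    n = 4
    b = Barrier(n)
    with Pool(processes=n, initializer=_init, initargs=(b, )) as p:
        print(p.map(_tasklet, range(n)))  # [0, 2, 4, 6]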
csrc/aio/py_test/ds_aio_handle.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier


def pre_handle(args, tid, read_op):
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    if args.gpu:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
    else:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    io_parallel = args.io_parallel if args.io_parallel else 1
    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
                                                args.queue_depth,
                                                args.single_submit,
                                                args.overlap_events,
                                                io_parallel)
    task_log(tid, f'created deepspeed aio handle')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['handle'] = handle
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0

    return ctxt


def pre_handle_read(pool_params):
    args, tid = pool_params
    ctxt = pre_handle(args, tid, True)
    return ctxt


def pre_handle_write(pool_params):
    args, tid = pool_params
    ctxt = pre_handle(args, tid, False)
    return ctxt


def post_handle(pool_params):
    _, _, ctxt = pool_params
    ctxt["buffer"].detach()
    ctxt["buffer"] = None

    return ctxt


def main_parallel_read(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']

    start_time = time.time()
    ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_parallel_write(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_handle_read(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']

    start_time = time.time()
    ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_handle_write(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def get_schedule(args, read_op):
    schedule = {}
    if read_op:
        schedule['pre'] = pre_handle_read
        schedule['post'] = post_handle
        schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read
    else:
        schedule['pre'] = pre_handle_write
        schedule['post'] = post_handle
        schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write

    return schedule


def _aio_handle_tasklet(pool_params):
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


def _init_tasklet(b):
    global aio_barrier
    aio_barrier = b


def aio_handle_multiprocessing(args, read_op):
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
csrc/aio/py_test/parse_aio_stats.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import argparse
import re

READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed'

PERF_METRICS = [READ_SPEED, WRITE_SPEED]

METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'}


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument('--log_dir',
                        type=str,
                        required=True,
                        help='Folder of statistics logs')

    parser.add_argument('--metric',
                        type=str,
                        required=True,
                        help='Performance metric to report: [read_speed|write_speed]')

    args = parser.parse_args()
    print(f'args = {args}')

    return args


def extract_value(key, file):
    INVALID_PREFIXES = ["ds"]
    for p in INVALID_PREFIXES:
        if key.startswith(p):
            return key

    try:
        if key[0] in ['t', 'd', 'p']:
            return int(key[1:])
        if key.startswith("bs"):
            if key.endswith('K'):
                v = key[2:].split('K')
                return int(v[0]) * 1024
            elif key.endswith('M'):
                v = key[2:].split('M')
                return int(v[0]) * 1024 * 1024
            else:
                return int(key[2:])
    except:
        print(f"{file}: extract_value fails on {key}")
        return None

    return key


def get_file_key(file):
    f, _ = os.path.splitext(os.path.basename(file))
    fields = f.split('_')
    values = [extract_value(k, file) for k in fields]
    return tuple(values)


def get_thread_count(file):
    f, _ = os.path.splitext(os.path.basename(file))
    fields = f.split('_')
    for key in fields:
        if key[0] == 't':
            return int(key[1:])
    return 1


"""
Extract performance metric from log file.
Sample file lines are:
    Task Read Latency = 0.031647682189941406 sec
    Task Read Speed = 12.342926020792527 GB/sec
    E2E Read Latency = 0.031697988510131836 sec
    E2E Read Speed = 12.323337169333062 GB/sec
For the above sample, --metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned.
"""


def get_metric(file, metric):
    thread_count = get_thread_count(file)
    with open(file) as f:
        for line in f.readlines():
            if line.startswith(METRIC_SEARCH[metric]):
                if metric in [READ_SPEED, WRITE_SPEED]:
                    fields = line.split()
                    return float(fields[-2])
                else:
                    fields = line.split('=')
                    return float(fields[-1])

    return None


def validate_args(args):
    if not args.metric in PERF_METRICS:
        print(f'{args.metric} is not a valid performance metric')
        return False

    if not os.path.isdir(args.log_dir):
        print(f'{args.log_dir} folder does not exist')
        return False

    return True


def get_results(log_files, metric):
    results = {}
    for f in log_files:
        file_key = get_file_key(f)
        value = get_metric(f, metric)
        results[file_key] = value

    return results


def get_sorted_results(log_dir, metric):
    log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))]

    log_files_path = [os.path.join(log_dir, f) for f in log_files]
    results = get_results(log_files_path, metric)
    result_keys = list(results.keys())
    sorted_keys = sorted(result_keys)
    return sorted_keys, results


def main():
    print("Parsing aio statistics")
    args = parse_arguments()
    if not validate_args(args):
        quit()

    sorted_keys, results = get_sorted_results(args.log_dir, args.metric)
    for k in sorted_keys:
        print(f'{k} = {results[k]}')


if __name__ == "__main__":
    main()
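As a concrete example of the key extraction above, take a log named with get_log_file's scheme from aio_bench_perf_sweep.py (the file name itself is hypothetical):

key = get_file_key('read_single_overlap_t4_p1_d32_bs1M.txt')
# -> ('read', 'single', 'overlap', 4, 1, 32, 1048576)
# 't4', 'p1', 'd32' parse to ints via the t/p/d prefixes;
# 'bs1M' expands to 1 * 1024 * 1024 bytes;
# unrecognized fields are kept as strings.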
csrc/aio/py_test/perf_sweep_utils.py (new file, mode 100644)

SCRIPT_PREFIX = '_aio_bench'
WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read'
READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io'
WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io'
BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs'
READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs'
WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs'
csrc/aio/py_test/run_read_sweep.sh (new file, mode 100644)

#!/bin/bash
if [[ $# -ne 2 ]]; then
    echo "Usage: $0 <input file> <output log dir>"
    exit 1
fi

function validate_environment()
{
    validate_cmd="python ./validate_async_io.py"
    eval ${validate_cmd}
    res=$?
    if [[ $res != 0 ]]; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}

validate_environment

INPUT_FILE=$1
if [[ ! -f ${INPUT_FILE} ]]; then
    echo "Input file not found: ${INPUT_FILE}"
    exit 1
fi

LOG_DIR=$2/aio_perf_sweep
RUN_SCRIPT=./test_ds_aio.py
READ_OPT="--read_file ${INPUT_FILE}"

if [[ -d ${LOG_DIR} ]]; then
    rm -f ${LOG_DIR}/*
else
    mkdir -p ${LOG_DIR}
fi

DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"

for sub in single block; do
    if [[ $sub == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ $ov == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        echo ${DISABLE_CACHE}
                        echo ${cmd}
                        echo ${SYNC}
                        eval ${DISABLE_CACHE}
                        eval ${cmd}
                        eval ${SYNC}
                        sleep 2
                    done
                done
            done
        done
    done
done
csrc/aio/py_test/run_write_sweep.sh (new file, mode 100644)

#!/bin/bash
function prep_folder()
{
    folder=$1
    if [[ -d ${folder} ]]; then
        rm -f ${folder}/*
    else
        mkdir -p ${folder}
    fi
}

function validate_environment()
{
    validate_cmd="python ./validate_async_io.py"
    eval ${validate_cmd}
    res=$?
    if [[ $res != 0 ]]; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}

validate_environment

if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <write size in MB> <write dir> <output log dir>"
    exit 1
fi

SIZE="$1M"
WRITE_DIR=$2
LOG_DIR=$3/aio_perf_sweep

OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt
WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"

prep_folder ${WRITE_DIR}
prep_folder ${LOG_DIR}

RUN_SCRIPT=./test_ds_aio.py

DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"

for sub in single block; do
    if [[ $sub == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ $ov == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        echo ${DISABLE_CACHE}
                        echo ${cmd}
                        echo ${SYNC}
                        eval ${DISABLE_CACHE}
                        eval ${cmd}
                        eval ${SYNC}
                        sleep 2
                    done
                done
            done
        done
    done
done
csrc/aio/py_test/single_process_config.json (new file, mode 100644)

{
    "block_size": ["128K", "256K", "1M"],
    "queue_depth": [4, 16, 32],
    "io_parallel": [1, 2, 4, 8],
    "single_submit": [true, false],
    "overlap_events": [true, false],
    "threads": [1]
}
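Each list in this config is swept independently: get_sweep_cmd_lines in aio_bench_perf_sweep.py flattens every key into CLI fragments and takes their cross product. A pared-down sketch of that expansion (two keys only, illustrative):

import itertools

config = {"queue_depth": [4, 16], "single_submit": [True, False]}

def flatten(key, values):
    # Booleans become bare flags (or a blank); other values become '--key value'.
    return [f'--{key}' if v is True else (' ' if v is False else f'--{key} {v}')
            for v in values]

options = [flatten(k, v) for k, v in config.items()]
for cmd in itertools.product(*options):
    print(list(cmd))
# 2 x 2 = 4 command lines, e.g. ['--queue_depth 4', '--single_submit']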
csrc/aio/py_test/test_ds_aio.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import torch
import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from test_ds_aio_utils import refine_args


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument('--read_file', type=str, default=None, help='Read file.')

    parser.add_argument('--write_file', type=str, default=None, help='Write file.')

    parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.')

    parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.')

    parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')

    parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')

    parser.add_argument('--single_submit',
                        action='store_true',
                        help='Submit I/O requests in singles (default is to submit queue_depth amount at once).')

    parser.add_argument('--overlap_events',
                        action='store_true',
                        help='Overlap I/O submission and completion requests.')

    parser.add_argument('--validate', action='store_true', help='Perform validation in library.')

    parser.add_argument('--handle', action='store_true', help='Use AIO handle.')

    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')

    parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')

    parser.add_argument('--gpu', action='store_true', help='Use GPU memory')

    args = parser.parse_args()
    print(f'args = {args}')
    return args


def validate_args(args):
    if args.read_file and not os.path.isfile(args.read_file):
        print(f'args validation error: {args.read_file} not found')
        return False

    return True


def main():
    print(f'Testing deepspeed_aio python frontend')

    args = parse_arguments()
    refine_args(args)
    if not validate_args(args):
        quit()

    mp.set_start_method('spawn')
    multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
    if args.read_file:
        multiprocess_function(args, True)

    if args.write_file:
        multiprocess_function(args, False)


if __name__ == "__main__":
    main()
csrc/aio/py_test/test_ds_aio_utils.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os

BYTES_PER_GB = 1024**3
LOG_TIDS = [0]


def task_log(tid, msg):
    if tid in LOG_TIDS:
        print(f'tid {tid}: {msg}')


def task_barrier(barrier, num_parties):
    assert barrier.parties == num_parties
    barrier.wait()
    assert barrier.broken == False


def report_results(args, read_op, pool_results):
    #print(f'pool_results = {pool_results}')
    io_string = 'Read' if read_op else 'Write'

    if None in pool_results:
        print(f'Failure in one of {args.threads} {io_string} processes')
        return

    total_bytes = sum([num_bytes for _, _, num_bytes in pool_results])

    task_latency_sec = max([sec for _, sec, _ in pool_results])
    task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
    print(f'Task {io_string} Latency = {task_latency_sec} sec')
    print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')

    e2e_latency_sec = max([sec for sec, _, _ in pool_results])
    e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
    print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
    print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')


def refine_integer_value(value):
    unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}

    if value[-1] in list(unit_dict.keys()):
        int_value = int(value[:-1]) * unit_dict[value[-1]]
        return int_value
    return int(value)


def refine_args(args):
    if args.write_size and type(args.write_size) == str:
        args.write_size = refine_integer_value(args.write_size)

    if args.block_size and type(args.block_size) == str:
        args.block_size = refine_integer_value(args.block_size)
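A few worked examples of the suffix handling in refine_integer_value, which follow directly from unit_dict above:

assert refine_integer_value('128K') == 128 * 1024      # 131072
assert refine_integer_value('400M') == 400 * 1024**2   # 419430400
assert refine_integer_value('1G') == 1024**3           # 1073741824
assert refine_integer_value('4096') == 4096            # no suffix: plain int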
csrc/aio/py_test/validate_async_io.py (new file, mode 100644)

"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
csrc/common/custom_cuda_kernel.cu (new file, mode 100644)

#include "custom_cuda_layers.h"

__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    if (id < size) { output[id] = (__half)input[id]; }
}

void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}

__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}

void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;
    size /= 2;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
csrc/common/custom_hip_kernel.hip (new file, mode 100644)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
__half2* output_cast = reinterpret_cast<__half2*>(output);
if (id < size) {
float input_f = input[id];
__half2* input_h = reinterpret_cast<__half2*>(&input_f);
output_cast[id] = *input_h;
}
}
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
size /= 2;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/includes/Timer.h (modified, +47 -47; diff not expanded in this capture)
csrc/includes/Timer_hip.h (new file, mode 100644)

// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__

#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"

class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    inline void Record() { hipEventRecord(start); }
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
                       1e3);
    }
};
#endif