OpenDAS / deepspeed · Commit 7d1a83a9
Authored May 25, 2022 by aiss
push Deepspeed 0.6.3 rocm version
Parent: ab5534fc

Showing 20 of the commit's 742 changed files, with 1789 additions and 47 deletions on this page:
csrc/aio/py_lib/deepspeed_py_aio_handle.h      +68  −0
csrc/aio/py_lib/deepspeed_py_copy.cpp          +133 −0
csrc/aio/py_lib/deepspeed_py_copy.h            +42  −0
csrc/aio/py_lib/py_ds_aio.cpp                  +41  −0
csrc/aio/py_test/aio_bench_generate_param.py   +96  −0
csrc/aio/py_test/aio_bench_perf_sweep.py       +397 −0
csrc/aio/py_test/ds_aio_basic.py               +144 −0
csrc/aio/py_test/ds_aio_handle.py              +176 −0
csrc/aio/py_test/parse_aio_stats.py            +154 −0
csrc/aio/py_test/perf_sweep_utils.py           +8   −0
csrc/aio/py_test/run_read_sweep.sh             +75  −0
csrc/aio/py_test/run_write_sweep.sh            +82  −0
csrc/aio/py_test/single_process_config.json    +29  −0
csrc/aio/py_test/test_ds_aio.py                +101 −0
csrc/aio/py_test/test_ds_aio_utils.py          +59  −0
csrc/aio/py_test/validate_async_io.py          +9   −0
csrc/common/custom_cuda_kernel.cu              +39  −0
csrc/common/custom_hip_kernel.hip              +41  −0
csrc/includes/Timer.h                          +47  −47
csrc/includes/Timer_hip.h                      +48  −0
csrc/aio/py_lib/deepspeed_py_aio_handle.h (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#include <condition_variable>
#include <memory>
#include "deepspeed_aio_thread.h"

struct deepspeed_aio_handle_t {
    std::unique_ptr<struct aio_context> _aio_ctxt;
    const bool _single_submit;
    const bool _overlap_events;
    const int _num_threads;
    deepspeed_aio_config_t _aio_config;

    std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
    std::vector<std::thread> _threads;
    int _num_pending_ops;

    deepspeed_aio_handle_t(const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const int num_threads);

    ~deepspeed_aio_handle_t();

    const int get_block_size() const;
    const int get_queue_depth() const;
    const bool get_single_submit() const;
    const bool get_overlap_events() const;
    const int get_thread_count() const;

    int read(torch::Tensor& buffer, const char* filename, const bool validate);

    int write(const torch::Tensor& buffer, const char* filename, const bool validate);

    int pread(const torch::Tensor& buffer,
              const char* filename,
              const bool validate,
              const bool async);

    int pwrite(const torch::Tensor& buffer,
               const char* filename,
               const bool validate,
               const bool async);

    int sync_pread(torch::Tensor& buffer, const char* filename);

    int sync_pwrite(const torch::Tensor& buffer, const char* filename);

    int async_pread(torch::Tensor& buffer, const char* filename);

    int async_pwrite(const torch::Tensor& buffer, const char* filename);

    int wait();

    void _stop_threads();

    void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);

    std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();

    bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
};
csrc/aio/py_lib/deepspeed_py_copy.cpp (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#include "deepspeed_py_copy.h"
#include <omp.h>

#define ROUND_DOWN(size, step) ((size) & ~((step)-1))

#if defined(__AVX512__) or defined(__AVX256__)
union AVX_Data {
#if defined(__AVX512__)
    __m512 data;
#else
    __m256 data;
#endif
};
#endif

static void helper_memcpy_1(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH);
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH) {
            AVX_Data src_4;
            src_4.data = SIMD_LOAD(src + i);
            SIMD_STORE(dest + i, src_4.data);
        }
    }
#endif

    if (param_size > rounded_size) {
#pragma omp parallel for
        for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; }
    }
}

static void helper_memcpy_4(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
            AVX_Data src_4[4];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
        }
    }
#endif
    if (param_size > rounded_size)
        helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}

static void helper_memcpy_8(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
            AVX_Data src_4[8];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);
            src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2));
            src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5);
            src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6);
            src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data);
        }
    }
#endif
    if (param_size > rounded_size)
        helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}

int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src)
{
    auto dest_c = dest.contiguous();
    auto src_c = src.contiguous();

    float* dest_ptr = (float*)dest_c.data_ptr();
    float* src_ptr = (float*)src_c.data_ptr();

    helper_memcpy_8(dest_ptr, src_ptr, dest_c.size(0));

    return 0;
}
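All three helpers share one pattern: ROUND_DOWN trims the element count to a multiple of the (possibly unrolled) vector width, the bulk is copied in OpenMP-parallel SIMD tiles, and the remainder is delegated to the next-smaller helper until a scalar loop finishes the tail. A minimal Python sketch of just the ROUND_DOWN arithmetic (names hypothetical, for illustration only):

# Hedged sketch of the ROUND_DOWN tail-splitting used by the helpers above.
# 'step' must be a power of two, matching the C macro ((size) & ~((step)-1)).
def round_down(size: int, step: int) -> int:
    return size & ~(step - 1)

param_size = 1000
simd_width = 16                              # e.g. AVX512: 16 floats per vector
bulk = round_down(param_size, simd_width)    # 992 elements copied vectorized
tail = param_size - bulk                     # 8 elements left for the scalar loop
assert (bulk, tail) == (992, 8)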
csrc/aio/py_lib/deepspeed_py_copy.h (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif

#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>

#define TILE (1024 * 1024 * 1024)

#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif

int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src);
csrc/aio/py_lib/py_ds_aio.cpp (new file, mode 100644)

/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");

    m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");

    m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");

    py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
        .def(py::init<const int, const int, const bool, const bool, const int>())
        .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
        .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
        .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
        .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
        .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
        .def("read", &deepspeed_aio_handle_t::read)
        .def("write", &deepspeed_aio_handle_t::write)
        .def("pread", &deepspeed_aio_handle_t::pread)
        .def("pwrite", &deepspeed_aio_handle_t::pwrite)
        .def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
        .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
        .def("async_pread", &deepspeed_aio_handle_t::async_pread)
        .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
        .def("wait", &deepspeed_aio_handle_t::wait);
}
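From Python, these bindings are reached through DeepSpeed's AsyncIOBuilder op loader, exactly as the py_test scripts later in this commit do. A hedged usage sketch of the aio_handle class (the file path and sizes are placeholders):

# Hedged usage sketch of the aio_handle binding, mirroring ds_aio_handle.py below.
# '/local_nvme/test.pt' is a placeholder path; block_size/queue_depth are examples.
import torch
from deepspeed.ops.aio import AsyncIOBuilder

aio = AsyncIOBuilder().load()
handle = aio.aio_handle(1048576, 32, False, False, 1)  # block_size, queue_depth,
                                                       # single_submit, overlap_events, io_parallel
buffer = torch.empty(1048576, dtype=torch.uint8, device='cpu').pin_memory()
ret = handle.pread(buffer, '/local_nvme/test.pt', False, True)  # validate=False, async=True
assert ret != -1
handle.wait()  # block until the in-flight parallel read completes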
csrc/aio/py_test/aio_bench_generate_param.py (new file, mode 100644)

"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import argparse
import json
from parse_aio_stats import READ_SPEED, WRITE_SPEED, get_sorted_results
from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir',
                        type=str,
                        default=BENCH_LOG_DIR,
                        help=f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}')

    args = parser.parse_args()
    print(f'args = {args}')

    return args


def validate_args(args):
    for d in [READ_LOG_DIR, WRITE_LOG_DIR]:
        log_dir = os.path.join(args.log_dir, d)
        if not os.path.isdir(log_dir):
            print(f'{log_dir} folder does not exist')
            return False

    return True


def convert_to_param(key):
    assert len(key) == 6
    return {
        "single_submit": "true" if key[0] == "single" else "false",
        "overlap_events": "true" if key[1] == "overlap" else "false",
        "thread_count": int(key[3]),
        "queue_depth": int(key[4]),
        "block_size": int(key[5])
    }


def generate_aio_param(read_log_dir, write_log_dir):
    _, read_results = get_sorted_results(read_log_dir, READ_SPEED)
    _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED)
    combined_perf = {key[1:]: value for key, value in read_results.items()}

    for key, value in write_results.items():
        new_key = key[1:]
        if new_key in combined_perf:
            combined_perf[new_key] += value
        else:
            combined_perf[new_key] = 0

    optimal_key = None
    optimal_perf = 0.0
    for key, value in combined_perf.items():
        if value > optimal_perf:
            optimal_perf = value
            optimal_key = key

    aio_param = {"aio": convert_to_param(optimal_key)}

    read_perf_keys = {key[1:]: key for key in read_results.keys()}
    write_perf_keys = {key[1:]: key for key in write_results.keys()}
    optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
    optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)

    print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}')
    print(json.dumps(aio_param, indent=3))


def main():
    print('Generate aio param')
    args = parse_arguments()
    if not validate_args(args):
        quit()

    read_log_dir = os.path.join(args.log_dir, READ_LOG_DIR)
    write_log_dir = os.path.join(args.log_dir, WRITE_LOG_DIR)
    generate_aio_param(read_log_dir, write_log_dir)


if __name__ == "__main__":
    main()
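For reference, the printed "aio" block has the shape below; the values are placeholders rather than measured results, chosen only to match the keys emitted by convert_to_param():

{
   "aio": {
      "single_submit": "false",
      "overlap_events": "true",
      "thread_count": 8,
      "queue_depth": 32,
      "block_size": 262144
   }
}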
csrc/aio/py_test/aio_bench_perf_sweep.py (new file, mode 100644)

"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import sys
import argparse
import json
import itertools
import subprocess
import shutil

from test_ds_aio_utils import refine_integer_value
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
    READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR

OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG = {
    "block_size": ["128K", "256K"],
    "queue_depth": [4, 16, 32],
    "overlap_events": [True, False],
    "io_parallel": [2, 8],
    "single_submit": [False]
}


class Job(object):
    def __init__(self, cmd_line, output_file=None, work_dir=None):
        self.cmd_line = cmd_line
        self.output_file = output_file
        self.work_dir = work_dir
        self.output_fd = None

    def cmd(self):
        return self.cmd_line

    def get_stdout(self):
        return self.output_fd

    def get_stderr(self):
        return self.output_fd

    def get_cwd(self):
        return self.work_dir

    def open_output_file(self):
        if self.output_file is not None:
            self.output_fd = open(self.output_file, 'w')

    def close_output_file(self):
        if self.output_fd is not None:
            self.output_fd.close()
            self.output_fd = None


class SweepConfig(object):
    def __init__(self, args):
        self.nvme_dir = args.nvme_dir
        self.io_size = args.io_size
        self.search_space = get_sweep_config_dict(args.sweep_config)
        self.read = not args.no_read
        self.write = not args.no_write
        self.flush_cache = not args.no_sudo
        self.log_dir = args.log_dir
        self.loops = args.loops
        self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}'


def parse_arguments():
    parser = argparse.ArgumentParser()

    parser.add_argument('--nvme_dir',
                        required=True,
                        type=str,
                        help='Directory in which to perform I/O tests. A writeable directory on an NVMe device.')

    parser.add_argument('--sweep_config',
                        type=str,
                        default=None,
                        help='Performance sweep configuration json file.')

    parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.')

    parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.')

    parser.add_argument('--io_size',
                        type=str,
                        default="400M",
                        help='Number of I/O bytes to read/write for performance measurements.')

    parser.add_argument(
        '--no_sudo',
        action='store_true',
        help='Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.')

    parser.add_argument('--log_dir',
                        type=str,
                        default=BENCH_LOG_DIR,
                        help=f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}')

    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')

    args = parser.parse_args()
    print(f'args = {args}')

    return args


def dump_cmd_lines(cmd_lines):
    print(f'cmd line count = {len(cmd_lines)}')
    for i, cmd in enumerate(cmd_lines):
        print(f'{i}: {cmd}')


def get_sweep_config_dict(sweep_config_json):
    if sweep_config_json is None:
        return DEFAULT_SWEEP_CONFIG

    with open(sweep_config_json) as fp:
        sweep_config = json.load(fp)
    return sweep_config


def get_sweep_cmd_lines(sweep_config_dict):
    def flatten_options(key, value_list):
        flat_list = []
        for v in value_list:
            if not type(v) is bool:
                flat_list.append(f'--{key} {v}')
            elif v:
                flat_list.append(f'--{key}')
            else:
                flat_list.append(' ')

        return flat_list

    flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()]
    cmd_list = list(itertools.product(*flat_list))
    cmd_list = [list(cmd) for cmd in cmd_list]
    #dump_cmd_lines(cmd_list)
    return cmd_list


def run_job(job):
    args = ' '.join(job.cmd())
    print(f'args = {args}')
    job.open_output_file()
    proc = subprocess.run(args=args,
                          shell=True,
                          stdout=job.get_stdout(),
                          stderr=job.get_stderr(),
                          cwd=job.get_cwd())
    job.close_output_file()
    assert proc.returncode == 0, \
        f"This command failed: {job.cmd()}"


def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
    for perf_job in sweep_jobs:
        if flush_cache_job is not None:
            run_job(sync_job)
            run_job(flush_cache_job)

        run_job(perf_job)

        run_job(sync_job)


def create_cmd_tags(cmd_line):
    tags = {}
    for param_value in cmd_line:
        fields = param_value.split()
        if len(fields) == 1:
            tags[fields[0]] = None
        elif len(fields) == 2:
            tags[fields[0]] = fields[1]
    return tags


def get_log_file(io_op_desc, cmd_line):
    QUEUE_DEPTH = "--queue_depth"
    BLOCK_SIZE = "--block_size"
    SINGLE_SUBMIT = "--single_submit"
    OVERLAP_EVENTS = "--overlap_events"
    THREAD_COUNT = "--threads"
    IO_PARALLEL = "--io_parallel"

    tag_map = {
        QUEUE_DEPTH: "d",
        BLOCK_SIZE: "bs",
        SINGLE_SUBMIT: "single",
        OVERLAP_EVENTS: "overlap",
        THREAD_COUNT: "t",
        IO_PARALLEL: "p"
    }

    tag_default = {
        QUEUE_DEPTH: 1,
        BLOCK_SIZE: "1M",
        SINGLE_SUBMIT: "block",
        OVERLAP_EVENTS: "sequential",
        THREAD_COUNT: 1,
        IO_PARALLEL: 1
    }

    def get_default_value(tag):
        value = tag_default[tag]
        if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]:
            return value
        return f'{tag_map[tag]}{value}'

    def get_config_value(tag, value):
        tag_key = tag_map[tag]
        if value is None:
            return tag_key
        return f'{tag_key}{value}'

    tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
    log_tags = [io_op_desc]
    cmd_tags = create_cmd_tags(cmd_line)
    for tag in tag_list:
        if tag in cmd_tags:
            log_tags.append(get_config_value(tag, cmd_tags[tag]))
        else:
            log_tags.append(get_default_value(tag))

    log_file = '_'.join(log_tags)
    log_file += '.txt'

    return log_file


def create_perf_jobs(io_op_desc, log_dir, cmd_lines):
    py_cmd = ['python', os.path.join(script_path(), PERF_SCRIPT)]

    perf_jobs = []
    for cmd in cmd_lines:
        log_file = os.path.join(log_dir, get_log_file(io_op_desc, cmd))
        job = Job(cmd_line=py_cmd + cmd, output_file=log_file)
        perf_jobs.append(job)

    return perf_jobs


def script_path():
    return os.path.dirname(os.path.realpath(sys.argv[0]))


def async_io_setup():
    import deepspeed
    from deepspeed.ops.aio import AsyncIOBuilder
    return AsyncIOBuilder().is_compatible()


def get_block_size_and_count(io_bytes):
    block_size = 1
    block_count = io_bytes
    bytes_in_KB = 1024

    while block_count % bytes_in_KB == 0:
        block_size *= bytes_in_KB
        block_count /= bytes_in_KB

    return int(block_size), int(block_count)


def create_read_file(sweep_config):
    read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}')
    os.makedirs(read_folder, exist_ok=True)
    read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
    block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
    dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
    print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    run_job(dd_job)
    print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    return read_folder, read_file_name


def remove_folder(folder):
    assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found"
    shutil.rmtree(folder)


def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    read_folder, read_file_name = create_read_file(sweep_config)
    read_option = f'--read_file {read_file_name}'
    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(read_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)

    remove_folder(read_folder)


def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}')
    os.makedirs(write_folder, exist_ok=True)
    write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
    write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(write_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)

    remove_folder(write_folder)


def main():
    print("Running performance sweep of deepspeed nvme library")

    if not async_io_setup():
        error_msg = """
            Failing because environment is not properly configured for deepspeed async i/o module.
            Possible fix: apt install libaio-dev.
        """
        print(error_msg)
        quit()

    args = parse_arguments()

    sweep_config = SweepConfig(args)

    cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)

    if sweep_config.flush_cache:
        flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"])
    else:
        flush_cache_job = None

    sync_job = Job(cmd_line=['sync'])

    if sweep_config.read:
        run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)

    if sweep_config.write:
        run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)


if __name__ == "__main__":
    main()
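The search space comes from DEFAULT_SWEEP_CONFIG above unless a JSON file with the same keys is supplied via --sweep_config. A hedged example of a narrower custom sweep config (values illustrative, not a recommendation):

{
    "block_size": ["256K", "1M"],
    "queue_depth": [8, 32],
    "overlap_events": [true],
    "io_parallel": [1],
    "single_submit": [false]
}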
csrc/aio/py_test/ds_aio_basic.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier


def pre_basic(args, tid, read_op):
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0

    return ctxt


def pre_basic_read(pool_params):
    args, tid = pool_params
    ctxt = pre_basic(args, tid, True)
    return ctxt


def pre_basic_write(pool_params):
    args, tid = pool_params
    ctxt = pre_basic(args, tid, False)
    return ctxt


def post_basic(pool_params):
    _, _, ctxt = pool_params
    ctxt["buffer"].detach()
    ctxt["buffer"] = None
    return ctxt


def main_basic_read(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
                                     ctxt['file'],
                                     args.block_size,
                                     args.queue_depth,
                                     args.single_submit,
                                     args.overlap_events,
                                     args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_basic_write(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
                                      ctxt['file'],
                                      args.block_size,
                                      args.queue_depth,
                                      args.single_submit,
                                      args.overlap_events,
                                      args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def get_schedule(args, read_op):
    schedule = {}
    if read_op:
        schedule['pre'] = pre_basic_read
        schedule['post'] = post_basic
        schedule['main'] = main_basic_read
    else:
        schedule['pre'] = pre_basic_write
        schedule['post'] = post_basic
        schedule['main'] = main_basic_write

    return schedule


def _aio_handle_tasklet(pool_params):
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


def _init_tasklet(b):
    global aio_barrier
    aio_barrier = b


def aio_basic_multiprocessing(args, read_op):
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
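The synchronization trick above is passing a multiprocessing.Barrier through the Pool initializer, since synchronization primitives cannot be sent as task arguments; each worker stores it as a module global (aio_barrier) via _init_tasklet. A stripped-down sketch of just that pattern, mirroring the structure above:

# Minimal sketch of the Barrier-through-Pool-initializer pattern used above.
from multiprocessing import Pool, Barrier

def _init(b):
    global shared_barrier
    shared_barrier = b  # each worker process keeps a handle to the same barrier

def _worker(rank):
    shared_barrier.wait()  # all workers enter the timed region together
    return rank

if __name__ == "__main__":
    n = 4
    barrier = Barrier(n)
    with Pool(processes=n, initializer=_init, initargs=(barrier, )) as pool:
        print(pool.map(_worker, range(n)))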
csrc/aio/py_test/ds_aio_handle.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier


def pre_handle(args, tid, read_op):
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    if args.gpu:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
    else:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    io_parallel = args.io_parallel if args.io_parallel else 1
    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
                                                args.queue_depth,
                                                args.single_submit,
                                                args.overlap_events,
                                                io_parallel)
    task_log(tid, f'created deepspeed aio handle')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['handle'] = handle
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0

    return ctxt


def pre_handle_read(pool_params):
    args, tid = pool_params
    ctxt = pre_handle(args, tid, True)
    return ctxt


def pre_handle_write(pool_params):
    args, tid = pool_params
    ctxt = pre_handle(args, tid, False)
    return ctxt


def post_handle(pool_params):
    _, _, ctxt = pool_params
    ctxt["buffer"].detach()
    ctxt["buffer"] = None
    return ctxt


def main_parallel_read(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']

    start_time = time.time()
    ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_parallel_write(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_handle_read(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']

    start_time = time.time()
    ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_handle_write(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def get_schedule(args, read_op):
    schedule = {}
    if read_op:
        schedule['pre'] = pre_handle_read
        schedule['post'] = post_handle
        schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read
    else:
        schedule['pre'] = pre_handle_write
        schedule['post'] = post_handle
        schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write

    return schedule


def _aio_handle_tasklet(pool_params):
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


def _init_tasklet(b):
    global aio_barrier
    aio_barrier = b


def aio_handle_multiprocessing(args, read_op):
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
csrc/aio/py_test/parse_aio_stats.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import argparse
import re

READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed'

PERF_METRICS = [READ_SPEED, WRITE_SPEED]

METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'}


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs')
    parser.add_argument('--metric',
                        type=str,
                        required=True,
                        help='Performance metric to report: [read_speed|write_speed]')

    args = parser.parse_args()
    print(f'args = {args}')

    return args


def extract_value(key, file):
    INVALID_PREFIXES = ["ds"]
    for p in INVALID_PREFIXES:
        if key.startswith(p):
            return key

    try:
        if key[0] in ['t', 'd', 'p']:
            return int(key[1:])
        if key.startswith("bs"):
            if key.endswith('K'):
                v = key[2:].split('K')
                return int(v[0]) * 1024
            elif key.endswith('M'):
                v = key[2:].split('M')
                return int(v[0]) * 1024 * 1024
            else:
                return int(key[2:])
    except:
        print(f"{file}: extract_value fails on {key}")
        return None

    return key


def get_file_key(file):
    f, _ = os.path.splitext(os.path.basename(file))
    fields = f.split('_')
    values = [extract_value(k, file) for k in fields]
    return tuple(values)


def get_thread_count(file):
    f, _ = os.path.splitext(os.path.basename(file))
    fields = f.split('_')
    for key in fields:
        if key[0] == 't':
            return int(key[1:])
    return 1


"""
Extract performance metric from log file.
Sample file lines are:
    Task Read Latency = 0.031647682189941406 sec
    Task Read Speed = 12.342926020792527 GB/sec
    E2E Read Latency = 0.031697988510131836 sec
    E2E Read Speed = 12.323337169333062 GB/sec

For the above sample, --metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned.
"""


def get_metric(file, metric):
    thread_count = get_thread_count(file)
    with open(file) as f:
        for line in f.readlines():
            if line.startswith(METRIC_SEARCH[metric]):
                if metric in [READ_SPEED, WRITE_SPEED]:
                    fields = line.split()
                    return float(fields[-2])
                else:
                    fields = line.split('=')
                    return float(fields[-1])

    return None


def validate_args(args):
    if not args.metric in PERF_METRICS:
        print(f'{args.metric} is not a valid performance metric')
        return False

    if not os.path.isdir(args.log_dir):
        print(f'{args.log_dir} folder does not exist')
        return False

    return True


def get_results(log_files, metric):
    results = {}
    for f in log_files:
        file_key = get_file_key(f)
        value = get_metric(f, metric)
        results[file_key] = value

    return results


def get_sorted_results(log_dir, metric):
    log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))]

    log_files_path = [os.path.join(log_dir, f) for f in log_files]
    results = get_results(log_files_path, metric)
    result_keys = list(results.keys())
    sorted_keys = sorted(result_keys)
    return sorted_keys, results


def main():
    print("Parsing aio statistics")
    args = parse_arguments()

    if not validate_args(args):
        quit()

    sorted_keys, results = get_sorted_results(args.log_dir, args.metric)
    for k in sorted_keys:
        print(f'{k} = {results[k]}')


if __name__ == "__main__":
    main()
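Because get_log_file() in aio_bench_perf_sweep.py encodes the configuration into the filename, get_file_key() recovers a sortable tuple of mixed strings and integers. An illustrative round trip:

# Illustrative only: key recovered from a sweep log filename.
from parse_aio_stats import get_file_key

key = get_file_key('read_single_overlap_t4_p1_d32_bs256K.txt')
# 't4' -> 4 threads, 'p1' -> 1 io_parallel, 'd32' -> queue depth 32,
# 'bs256K' -> 256 * 1024 byte blocks; non-numeric tags pass through as strings.
assert key == ('read', 'single', 'overlap', 4, 1, 32, 262144)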
csrc/aio/py_test/perf_sweep_utils.py (new file, mode 100644)

SCRIPT_PREFIX = '_aio_bench'
WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read'
READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io'
WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io'
BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs'
READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs'
WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs'
csrc/aio/py_test/run_read_sweep.sh (new file, mode 100644)

#!/bin/bash
if [[ $# -ne 2 ]]; then
    echo "Usage: $0 <input file> <output log dir>"
    exit 1
fi

function validate_environment()
{
    validate_cmd="python ./validate_async_io.py"
    eval ${validate_cmd}
    res=$?
    if [[ $res != 0 ]]; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}

validate_environment

INPUT_FILE=$1
if [[ ! -f ${INPUT_FILE} ]]; then
    echo "Input file not found: ${INPUT_FILE}"
    exit 1
fi

LOG_DIR=$2/aio_perf_sweep
RUN_SCRIPT=./test_ds_aio.py
READ_OPT="--read_file ${INPUT_FILE}"

if [[ -d ${LOG_DIR} ]]; then
    rm -f ${LOG_DIR}/*
else
    mkdir -p ${LOG_DIR}
fi

DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"

for sub in single block; do
    if [[ $sub == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ $ov == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        echo ${DISABLE_CACHE}
                        echo ${cmd}
                        echo ${SYNC}

                        eval ${DISABLE_CACHE}
                        eval ${cmd}
                        eval ${SYNC}
                        sleep 2
                    done
                done
            done
        done
    done
done
csrc/aio/py_test/run_write_sweep.sh (new file, mode 100644)

#!/bin/bash
function prep_folder()
{
    folder=$1
    if [[ -d ${folder} ]]; then
        rm -f ${folder}/*
    else
        mkdir -p ${folder}
    fi
}

function validate_environment()
{
    validate_cmd="python ./validate_async_io.py"
    eval ${validate_cmd}
    res=$?
    if [[ $res != 0 ]]; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}

validate_environment

if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <write size in MB> <write dir> <output log dir>"
    exit 1
fi

SIZE="$1M"
WRITE_DIR=$2
LOG_DIR=$3/aio_perf_sweep

OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt
WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"

prep_folder ${WRITE_DIR}
prep_folder ${LOG_DIR}

RUN_SCRIPT=./test_ds_aio.py
DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"

for sub in single block; do
    if [[ $sub == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ $ov == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        echo ${DISABLE_CACHE}
                        echo ${cmd}
                        echo ${SYNC}

                        eval ${DISABLE_CACHE}
                        eval ${cmd}
                        eval ${SYNC}
                        sleep 2
                    done
                done
            done
        done
    done
done
csrc/aio/py_test/single_process_config.json (new file, mode 100644)

{
    "block_size": ["128K", "256K", "1M"],
    "queue_depth": [4, 16, 32],
    "io_parallel": [1, 2, 4, 8],
    "single_submit": [true, false],
    "overlap_events": [true, false],
    "threads": [1]
}
csrc/aio/py_test/test_ds_aio.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os
import torch
import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from test_ds_aio_utils import refine_args


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--read_file', type=str, default=None, help='Read file.')

    parser.add_argument('--write_file', type=str, default=None, help='Write file.')

    parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.')

    parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.')

    parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')

    parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')

    parser.add_argument('--single_submit',
                        action='store_true',
                        help='Submit I/O requests in singles (default is to submit queue_depth amount at once).')

    parser.add_argument('--overlap_events',
                        action='store_true',
                        help='Overlap I/O submission and completion requests.')

    parser.add_argument('--validate', action='store_true', help='Perform validation in library.')

    parser.add_argument('--handle', action='store_true', help='Use AIO handle.')

    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')

    parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')

    parser.add_argument('--gpu', action='store_true', help='Use GPU memory')

    args = parser.parse_args()
    print(f'args = {args}')
    return args


def validate_args(args):
    if args.read_file and not os.path.isfile(args.read_file):
        print(f'args validation error: {args.read_file} not found')
        return False

    return True


def main():
    print(f'Testing deepspeed_aio python frontend')

    args = parse_arguments()
    refine_args(args)
    if not validate_args(args):
        quit()

    mp.set_start_method('spawn')
    multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
    if args.read_file:
        multiprocess_function(args, True)

    if args.write_file:
        multiprocess_function(args, False)


if __name__ == "__main__":
    main()
csrc/aio/py_test/test_ds_aio_utils.py (new file, mode 100644)

"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""

import os

BYTES_PER_GB = 1024**3
LOG_TIDS = [0]


def task_log(tid, msg):
    if tid in LOG_TIDS:
        print(f'tid {tid}: {msg}')


def task_barrier(barrier, num_parties):
    assert barrier.parties == num_parties
    barrier.wait()
    assert barrier.broken == False


def report_results(args, read_op, pool_results):
    #print(f'pool_results = {pool_results}')
    io_string = 'Read' if read_op else 'Write'

    if None in pool_results:
        print(f'Failure in one of {args.threads} {io_string} processes')
        return

    total_bytes = sum([num_bytes for _, _, num_bytes in pool_results])

    task_latency_sec = max([sec for _, sec, _ in pool_results])
    task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
    print(f'Task {io_string} Latency = {task_latency_sec} sec')
    print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')

    e2e_latency_sec = max([sec for sec, _, _ in pool_results])
    e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
    print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
    print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')


def refine_integer_value(value):
    unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}

    if value[-1] in list(unit_dict.keys()):
        int_value = int(value[:-1]) * unit_dict[value[-1]]
        return int_value
    return int(value)


def refine_args(args):
    if args.write_size and type(args.write_size) == str:
        args.write_size = refine_integer_value(args.write_size)

    if args.block_size and type(args.block_size) == str:
        args.block_size = refine_integer_value(args.block_size)
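Note that refine_integer_value() treats the K/M/G suffixes as binary units, so e.g. the sweep's default --io_size of '400M' becomes 400 * 1024**2 bytes. A few illustrative cases:

# Illustrative: suffix handling in refine_integer_value().
from test_ds_aio_utils import refine_integer_value

assert refine_integer_value('128K') == 131072       # 128 * 1024
assert refine_integer_value('400M') == 419430400    # 400 * 1024**2
assert refine_integer_value('4096') == 4096         # no suffix: plain int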
csrc/aio/py_test/validate_async_io.py (new file, mode 100644)

"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.

Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
csrc/common/custom_cuda_kernel.cu (new file, mode 100644)

#include "custom_cuda_layers.h"

__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    if (id < size) { output[id] = (__half)input[id]; }
}

void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}

__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}

void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;
    size /= 2;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
csrc/common/custom_hip_kernel.hip (new file, mode 100644)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
__half2* output_cast = reinterpret_cast<__half2*>(output);
if (id < size) {
float input_f = input[id];
__half2* input_h = reinterpret_cast<__half2*>(&input_f);
output_cast[id] = *input_h;
}
}
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
size /= 2;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/includes/Timer.h (+47 −47; diff content not rendered on this page)
csrc/includes/Timer_hip.h (new file, mode 100644)

// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__

#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"

class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    inline void Record() { hipEventRecord(start); }
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
                       1e3);
    }
};
#endif