Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
deepspeed
Commits
67ea635f
Commit
67ea635f
authored
Mar 30, 2023
by
aiss
Browse files
push dsv0.8.2 version
parent
1b2721ad
Pipeline
#201
failed with stages
in 0 seconds
Changes
339
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2236 deletions
+0
-2236
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp
+0
-74
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h
+0
-57
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp
+0
-123
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h
+0
-77
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp
+0
-84
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h
+0
-57
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp
+0
-121
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h
+0
-27
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+0
-282
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+0
-68
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp
+0
-133
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h
+0
-42
deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp
deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp
+0
-41
deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py
deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py
+0
-96
deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py
deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py
+0
-397
deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py
deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py
+0
-144
deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py
deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py
+0
-176
deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py
deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py
+0
-154
deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py
deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py
+0
-8
deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh
deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh
+0
-75
No files found.
Too many changes to show.
To preserve performance only
339 of 339+
files are displayed.
Plain diff
Email patch
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using
namespace
std
;
const
int
c_block_size
=
128
*
1024
;
const
int
c_io_queue_depth
=
8
;
// Default configuration: library-default block size and queue depth,
// with single-submit, event overlap, and memory locking all disabled.
deepspeed_aio_config_t::deepspeed_aio_config_t()
    : _block_size(c_block_size),
      _queue_depth(c_io_queue_depth),
      _single_submit(false),
      _overlap_events(false),
      _lock_memory(false)
{
}

// Fully-specified configuration; every knob is taken verbatim from the caller.
deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size,
                                               const int queue_depth,
                                               const bool single_submit,
                                               const bool overlap_events,
                                               const bool lock_memory)
    : _block_size(block_size),
      _queue_depth(queue_depth),
      _single_submit(single_submit),
      _overlap_events(overlap_events),
      _lock_memory(lock_memory)
{
}
// Print "<tag><min> <max> <avg> " followed by a newline to stdout.
void deepspeed_aio_latency_t::dump(const std::string tag)
{
    std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl;
}

// Element-wise accumulation of another latency record into this one.
void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& rhs)
{
    _min_usec += rhs._min_usec;
    _max_usec += rhs._max_usec;
    _avg_usec += rhs._avg_usec;
}

// Element-wise scaling of all three statistics by `factor`.
void deepspeed_aio_latency_t::scale(const float factor)
{
    _min_usec *= factor;
    _max_usec *= factor;
    _avg_usec *= factor;
}
// Build a libaio context: pre-allocate `queue_depth` zeroed iocb slots,
// size the completion-event array, and initialize the kernel AIO queue.
// NOTE(review): the io_queue_init() return value is not checked — TODO confirm
// callers can tolerate a silently uninitialized context.
aio_context::aio_context(const int block_size, const int queue_depth)
{
    _block_size = block_size;
    _queue_depth = queue_depth;
    for (auto slot = 0; slot < queue_depth; ++slot) {
        _iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb)));
    }
    _io_events.resize(queue_depth);
    io_queue_init(queue_depth, &_io_ctxt);
}

// Release every pre-allocated iocb, drop the event array, and tear down
// the kernel AIO queue.
aio_context::~aio_context()
{
    for (auto& cb : _iocbs) { free(cb); }
    _io_events.resize(0);
    io_queue_release(_io_ctxt);
}
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <libaio.h>
#include <stdlib.h>
#include <string>
#include <vector>
using
namespace
std
;
// Min/max/average latency statistics (microseconds) for one phase of an AIO run.
struct deepspeed_aio_latency_t {
    double _min_usec;  // smallest observed latency
    double _max_usec;  // largest observed latency
    double _avg_usec;  // mean latency

    // Print "<tag><min> <max> <avg> " to stdout.
    void dump(const std::string tag);
    // Element-wise += of another record into this one.
    void accumulate(const deepspeed_aio_latency_t&);
    // Element-wise *= of all three statistics.
    void scale(const float value);
};
// Aggregate performance record for one AIO operation: per-phase latency
// plus end-to-end time and throughput.
struct deepspeed_aio_perf_t {
    deepspeed_aio_latency_t _submit;    // io_submit phase latency stats
    deepspeed_aio_latency_t _complete;  // completion/getevents phase latency stats
    double _e2e_usec;                   // total wall time, microseconds
    double _e2e_rate_GB;                // effective throughput, GB/s
};
struct
deepspeed_aio_config_t
{
const
int
_block_size
;
const
int
_queue_depth
;
const
bool
_single_submit
;
const
bool
_overlap_events
;
const
bool
_lock_memory
;
deepspeed_aio_config_t
();
deepspeed_aio_config_t
(
const
int
block_size
,
const
int
queue_depth
,
const
bool
single_submit
,
const
bool
overlap_events
,
const
bool
lock_memory
);
};
// Owns the kernel libaio queue plus pre-allocated request/completion arrays.
// Constructor calloc()s one iocb per queue slot; destructor frees them and
// releases the kernel queue.
struct aio_context {
    io_context_t _io_ctxt;                   // kernel AIO queue handle
    std::vector<struct io_event> _io_events; // completion events, sized to _queue_depth
    std::vector<struct iocb*> _iocbs;        // owned, heap-allocated request slots
    int _block_size;                         // bytes per request
    int _queue_depth;                        // number of slots

    aio_context(const int block_size, const int queue_depth);
    ~aio_context();
};
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using
namespace
std
;
const
int
c_block_size
=
128
*
1024
;
const
int
c_io_queue_depth
=
8
;
// Describe one file<->memory transfer: the open descriptor, the starting
// file offset, the host buffer, and the total byte count.
io_xfer_ctxt::io_xfer_ctxt(const int fd,
                           const long long int file_offset,
                           const long long int num_bytes,
                           const void* buffer)
    : _fd(fd),
      _base_offset(file_offset),
      _mem_buffer(buffer),
      _num_bytes(num_bytes)
{
}
// Bind a transfer description to a caller-owned iocb array so blocks of the
// transfer can be prepared for submission.
io_prep_context::io_prep_context(const bool read_op,
                                 const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                                 const size_t block_size,
                                 const std::vector<struct iocb*>* iocbs)
    : _read_op(read_op),
      _xfer_ctxt(xfer_ctxt),
      _block_size(block_size),
      _iocbs(iocbs)
{
}

// Fill the first `n_iocbs` slots with requests covering `num_bytes` starting
// at `start_buffer`/`start_offset`. The final block is truncated so the total
// never exceeds `num_bytes`.
// NOTE(review): the buffer address is advanced by _base_offset as well as the
// block shift, mirroring the file offset — presumably caller passes buffers
// laid out to match; confirm against do_aio_operation_* callers.
void io_prep_context::prep_iocbs(const int n_iocbs,
                                 const size_t num_bytes,
                                 const void* start_buffer,
                                 const long long int start_offset)
{
    assert(static_cast<size_t>(n_iocbs) <= _iocbs->size());

    for (auto slot = 0; slot < n_iocbs; ++slot) {
        const auto shift = slot * _block_size;
        const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift;
        const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift;

        // Truncate the trailing partial block.
        auto byte_count = _block_size;
        if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; }

        if (_read_op) {
            io_prep_pread(_iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
        } else {
            io_prep_pwrite(_iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
        }
    }
}
// Incremental request generator: tracks how many bytes/blocks of the transfer
// remain and prepares them batch by batch.
io_prep_generator::io_prep_generator(const bool read_op,
                                     const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                                     const size_t block_size)
    : _read_op(read_op),
      _xfer_ctxt(xfer_ctxt),
      _block_size(block_size),
      _remaining_bytes(xfer_ctxt->_num_bytes),
      _next_iocb_index(0)
{
    // Round up so a trailing partial block still gets its own request.
    _num_io_blocks =
        static_cast<long long int>(ceil(static_cast<double>(xfer_ctxt->_num_bytes) / block_size));
    _remaining_io_blocks = _num_io_blocks;
}

// Prepare up to `n_iocbs` requests into `iocbs`, consuming the remaining
// blocks in order. Returns the number of requests actually prepared
// (0 when the transfer is exhausted).
int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs)
{
    if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) {
        // NOTE(review): this compares a byte count against a block count;
        // both are only guaranteed to be 0 together — presumably a sanity
        // check that they hit zero simultaneously. Confirm intent.
        assert(static_cast<long long int>(_remaining_bytes) == _remaining_io_blocks);
        return 0;
    }

    assert(static_cast<size_t>(n_iocbs) <= iocbs->size());

    auto actual_n_iocbs = min(static_cast<long long int>(n_iocbs), _remaining_io_blocks);
    for (auto slot = 0; slot < actual_n_iocbs; ++slot, ++_next_iocb_index) {
        const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size);
        const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset;
        // The last block may be short.
        const auto num_bytes = min(static_cast<long long int>(_block_size), _remaining_bytes);

        if (_read_op) {
            io_prep_pread(iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
        } else {
            io_prep_pwrite(iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
        }
        _remaining_bytes -= num_bytes;
    }
    _remaining_io_blocks -= actual_n_iocbs;

    return actual_n_iocbs;
}
// Fetch the size of `filename` in bytes via stat(2).
// On success returns 0 and stores the size in `size`; on failure returns -1
// (errno is left set by stat for the caller to report).
int get_file_size(const char* filename, long long int& size)
{
    struct stat sbuf;
    if (stat(filename, &sbuf) == -1) { return -1; }
    size = sbuf.st_size;
    return 0;
}
// Allocate `size` bytes of page-aligned host memory; if `lock` is set, pin
// the pages with mlock(2) so they cannot be swapped out (required for stable
// O_DIRECT/AIO buffers).
// Returns the buffer, or nullptr if allocation or pinning fails. On a pinning
// failure the buffer is freed before returning.
// Fix: the mlock failure diagnostic previously went to stdout via printf;
// error output belongs on stderr so it is not interleaved with timing output.
void* ds_page_aligned_alloc(const size_t size, const bool lock)
{
    void* ptr = nullptr;
    const int retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size);
    if (retval) { return nullptr; }

    if (lock == false) { return ptr; }

    const auto mlock_ret = mlock(ptr, size);
    if (mlock_ret != 0) {
        const auto mlock_error = errno;
        fprintf(stderr, "mlock failed with %d %s\n", mlock_error, strerror(mlock_error));
        free(ptr);
        return nullptr;
    }

    return ptr;
}
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <libaio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <deepspeed_aio_types.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
struct
io_xfer_ctxt
{
const
int
_fd
;
const
long
long
int
_base_offset
;
const
void
*
_mem_buffer
;
const
long
long
int
_num_bytes
;
io_xfer_ctxt
(
const
int
fd
,
const
long
long
int
file_offset
,
const
long
long
int
num_bytes
,
const
void
*
buffer
);
};
// Prepares iocb requests for a transfer using a caller-owned iocb array.
// Holds only references/pointers; the transfer context and iocb vector must
// outlive this object.
struct io_prep_context {
    const bool _read_op;                              // true = read, false = write
    const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;  // transfer being prepared
    const size_t _block_size;                         // bytes per request
    const std::vector<struct iocb*>* _iocbs;          // caller-owned request slots

    io_prep_context(const bool read_op,
                    const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                    const size_t block_size,
                    const std::vector<struct iocb*>* iocbs);

    // Fill the first n_iocbs slots covering num_bytes from
    // start_buffer/start_offset; the final block is truncated.
    void prep_iocbs(const int n_iocbs,
                    const size_t num_bytes,
                    const void* start_buffer,
                    const long long int start_offset);
};
// Incremental iocb generator: repeatedly call prep_iocbs() to consume a
// transfer block by block. Tracks remaining bytes/blocks and the next block
// index between calls.
struct io_prep_generator {
    const bool _read_op;                              // true = read, false = write
    const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;  // transfer being consumed
    const size_t _block_size;                         // bytes per request

    long long int _remaining_bytes;      // bytes not yet prepared
    long long int _num_io_blocks;        // total blocks (rounded up)
    long long int _remaining_io_blocks;  // blocks not yet prepared
    long long int _next_iocb_index;      // index of next block to prepare

    io_prep_generator(const bool read_op,
                      const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                      const size_t block_size);

    // Prepare up to n_iocbs requests; returns the number prepared (0 = done).
    int prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs);
};
// Allocate page-aligned host memory; optionally pin it with mlock(2).
// Returns nullptr on allocation or pinning failure.
void* ds_page_aligned_alloc(const size_t size, const bool lock = false);

// Store the byte size of `filename` into `size` via stat(2).
// Returns 0 on success, -1 on failure (errno set).
int get_file_size(const char* filename, long long int& size);
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_aio_thread.h"
using
namespace
std
;
// Describe one parallel I/O operation. For CUDA tensors the data is staged
// through a pinned host copy so the AIO threads can transfer host memory.
io_op_desc_t::io_op_desc_t(const bool read_op,
                           const torch::Tensor& buffer,
                           const int fd,
                           const char* filename,
                           const long long int num_bytes,
                           const bool validate)
    : _read_op(read_op),
      _buffer(buffer),
      _fd(fd),
      _filename(filename),
      _num_bytes(num_bytes),
      _validate(validate)
{
    // CUDA tensors cannot be targeted by file I/O directly: stage via a
    // pinned CPU copy. CPU tensors are used as-is.
    _cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer;
    // AIO requires a contiguous region.
    _contiguous_buffer = _cpu_buffer.contiguous();
}

// Raw pointer to the contiguous host staging buffer.
char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }

// Finish the op: for reads into CUDA tensors, copy the staged host data back
// to the device. Writes and CPU reads need no post-processing.
void io_op_desc_t::fini()
{
    if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
}
// Worker-thread state: each thread owns a private aio_context sized from the
// shared config and starts with the exit flag cleared.
deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
    : _tid(tid),
      _aio_config(aio_config),
      _aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)),
      _time_to_exit(false)
{
}

deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {}

// Worker loop: block until work arrives or shutdown is requested, execute the
// op against this thread's slice of the file, then publish completion.
void deepspeed_aio_thread_t::run()
{
    while (true) {
        std::shared_ptr<struct io_op_desc_t> next_io_op = nullptr;

        {
            // Wait under the work mutex for either a queued op or the exit flag.
            std::unique_lock<std::mutex> lock(_work_sync._mutex);
            _work_sync._cond_var.wait(lock,
                                      [this] { return (!_work_queue.empty() || _time_to_exit); });
            if (!_work_queue.empty()) {
                next_io_op = _work_queue.front();
                _work_queue.pop();
            }
        }

        if (next_io_op) {
            // Each thread handles a disjoint slice: thread t covers bytes
            // [t * num_bytes, (t+1) * num_bytes) of the file.
            const auto base_offset = next_io_op->_num_bytes * _tid;

            std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(
                next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr()));

            if (_aio_config._overlap_events) {
                do_aio_operation_overlap(
                    next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
            } else {
                do_aio_operation_sequential(
                    next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
            }

            {
                // Publish completion, then notify outside the lock.
                std::lock_guard<std::mutex> lock(_complete_sync._mutex);
                _complete_queue.push(next_io_op);
            }
            _complete_sync._cond_var.notify_one();
        }

        if (_time_to_exit) { break; }
    }
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include <queue>
#include "deepspeed_py_aio.h"
// One scheduled parallel I/O operation, shared by all worker threads.
// _num_bytes is the PER-THREAD byte count (total bytes / thread count).
struct io_op_desc_t {
    const bool _read_op;                // true = read, false = write
    torch::Tensor _buffer;              // caller's tensor (may be CUDA)
    int _fd;                            // open file descriptor
    const std::string _filename;        // file path (kept for validation/reporting)
    const long long int _num_bytes;     // bytes handled by each thread
    torch::Tensor _cpu_buffer;          // pinned host staging copy for CUDA tensors
    torch::Tensor _contiguous_buffer;   // contiguous view actually used for I/O
    const bool _validate;               // re-read/verify after the op completes

    io_op_desc_t(const bool read_op,
                 const torch::Tensor& buffer,
                 const int fd,
                 const char* filename,
                 const long long int num_bytes,
                 const bool validate);

    // Raw pointer to the contiguous host staging buffer.
    char* data_ptr() const;
    // Post-completion work (copy staged read data back to CUDA device).
    void fini();
};
// Mutex + condition-variable pair guarding one work/completion queue.
// NOTE(review): std::mutex is used but this header does not include <mutex>
// directly — presumably pulled in transitively; confirm and add the include.
struct thread_sync_t {
    std::mutex _mutex;
    std::condition_variable _cond_var;
};
// Per-worker-thread context: private aio_context plus work/completion queues
// with their synchronization. run() is the thread entry loop.
struct deepspeed_aio_thread_t {
    const int _tid;                      // thread index; selects the file slice
    deepspeed_aio_config_t& _aio_config; // shared config (owned by the handle)

    std::unique_ptr<struct aio_context> _aio_ctxt;  // this thread's libaio queue

    std::queue<std::shared_ptr<struct io_op_desc_t>> _work_queue;      // pending ops
    std::queue<std::shared_ptr<struct io_op_desc_t>> _complete_queue;  // finished ops

    bool _time_to_exit;  // set (under _work_sync._mutex) to stop run()

    struct thread_sync_t _work_sync;      // guards _work_queue
    struct thread_sync_t _complete_sync;  // guards _complete_queue

    deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config);
    ~deepspeed_aio_thread_t();

    // Thread entry loop: consume work until _time_to_exit is observed.
    void run();
};
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cassert>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "deepspeed_py_aio.h"
using
namespace
std
;
using
namespace
std
::
chrono
;
// Debug tracing toggles (0 = off).
#define DEBUG_DS_AIO_READ 0
#define DEBUG_DS_AIO_WRITE 0

// Library name used in diagnostics.
static const std::string c_library_name = "deepspeed_aio";
// Single-shot synchronous write of `buffer` to `filename` using libaio.
// Builds a one-off config and aio_context, runs the transfer (overlapped or
// sequential per config), optionally validates by re-reading, and prints
// timing. Returns 0 on success, -1 if the file cannot be opened.
int deepspeed_py_aio_write(const torch::Tensor& buffer,
                           const char* filename,
                           const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const bool validate)
{
    const auto start_time = std::chrono::high_resolution_clock::now();

    deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);

    const auto fd = open_file(filename, false);
    if (fd == -1) { return -1; }

    auto write_buffer = (char*)buffer.data_ptr();
    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());

    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
    std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));

    if (config._overlap_events) {
        do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr);
    } else {
        do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr);
    }
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    close(fd);

    if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
// Single-shot synchronous read of `filename` into `buffer` using libaio.
// The tensor must already be sized to the file (checked with assert only —
// release builds will proceed on mismatch). Returns 0 on success, -1 when
// the file cannot be stat'ed or opened.
int deepspeed_py_aio_read(torch::Tensor& buffer,
                          const char* filename,
                          const int block_size,
                          const int queue_depth,
                          const bool single_submit,
                          const bool overlap_events,
                          const bool validate)
{
    const auto start_time = std::chrono::high_resolution_clock::now();

    long long num_file_bytes;
    if (-1 == get_file_size(filename, num_file_bytes)) {
        const auto error_code = errno;
        report_file_error(filename, " fstat for read", error_code);
        return -1;
    }

    deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);

    const auto fd = open_file(filename, true);
    if (fd == -1) { return -1; }

    auto read_buffer = (char*)buffer.data_ptr();
    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);

    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
    std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));

    if (config._overlap_events) {
        do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr);
    } else {
        do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr);
    }
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    close(fd);

    if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
// Synchronous single-shot AIO write of `buffer` to `filename`.
// Returns 0 on success, -1 on open failure.
int deepspeed_py_aio_write(const torch::Tensor& buffer,
                           const char* filename,
                           const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const bool validate);

// Synchronous single-shot AIO read of `filename` into `buffer`.
// Buffer must match the file size. Returns 0 on success, -1 on stat/open failure.
int deepspeed_py_aio_read(torch::Tensor& buffer,
                          const char* filename,
                          const int block_size,
                          const int queue_depth,
                          const bool single_submit,
                          const bool overlap_events,
                          const bool validate);
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_aio_handle.h"
using
namespace
std
;
// std::thread entry point: hold a shared_ptr to the thread context (keeping
// it alive for the thread's lifetime) and run its work loop.
static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt)
{
    ctxt->run();
}
// Build the handle: one shared config, one context per worker, then launch
// the worker threads. Each worker gets its own aio_context sized from the
// same block size / queue depth.
deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
                                               const int queue_depth,
                                               const bool single_submit,
                                               const bool overlap_events,
                                               const int num_threads)
    : _aio_ctxt(new aio_context(block_size, queue_depth)),
      _single_submit(single_submit),
      _overlap_events(overlap_events),
      _num_threads(num_threads),
      _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
      _num_pending_ops(0)
{
    for (auto tid = 0; tid < num_threads; ++tid) {
        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(tid, _aio_config));
    }

    for (auto& ctxt : _thread_contexts) {
        _threads.push_back(std::thread(_start_aio_thread, ctxt));
    }
}

// Signal all workers to exit and join them. Requires no pending ops
// (asserted inside _stop_threads).
deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
{
    _stop_threads();
    for (auto& thr : _threads) { thr.join(); }
}
// Accessors for the handle's configuration. Block size and queue depth come
// from the handle's own aio_context and report -1 if it is absent.

const int deepspeed_aio_handle_t::get_block_size() const
{
    return _aio_ctxt ? _aio_ctxt->_block_size : -1;
}

const int deepspeed_aio_handle_t::get_queue_depth() const
{
    return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
}

const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; }

const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }

const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
// Single-threaded synchronous read of the whole file into `buffer`, executed
// on the caller's thread with the handle's own aio_context (workers are not
// involved). Buffer size must equal file size (assert only). Returns 0 on
// success, -1 on stat/open failure.
int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
{
    const auto start_time = std::chrono::high_resolution_clock::now();

    assert(_aio_ctxt);

    long long num_file_bytes;
    if (-1 == get_file_size(filename, num_file_bytes)) {
        const auto error_code = errno;
        report_file_error(filename, " fstat for read", error_code);
        return -1;
    }
    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);

    const auto fd = open_file(filename, true);
    if (fd == -1) { return -1; }

    auto read_buffer = (char*)buffer.data_ptr();
    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));

    if (_aio_config._overlap_events) {
        do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    } else {
        do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    }

    close(fd);
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
// Single-threaded synchronous write of `buffer` to `filename`, executed on
// the caller's thread with the handle's own aio_context. Returns 0 on
// success, -1 on open failure.
int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
                                  const char* filename,
                                  const bool validate)
{
    assert(_aio_ctxt);

    const auto start_time = std::chrono::high_resolution_clock::now();

    const auto fd = open_file(filename, false);
    if (fd == -1) { return -1; }

    auto write_buffer = (char*)buffer.data_ptr();
    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));

    if (_aio_config._overlap_events) {
        do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    } else {
        do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    }
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    close(fd);

    if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
// Hand the same op descriptor to EVERY worker thread (each processes its own
// slice of the file), then count one pending op.
void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
{
    for (auto& ctxt : _thread_contexts) {
        {
            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
            ctxt->_work_queue.push(scheduled_op);
        }
        ctxt->_work_sync._cond_var.notify_one();
    }
    _num_pending_ops++;
}
// Collect one completion from every worker thread. Since _schedule_aio_work
// pushes the same descriptor to all threads, popping one entry per thread
// retires exactly one logical op; the shared descriptor is returned.
std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_wait_for_aio_work()
{
    std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
    for (auto& ctxt : _thread_contexts) {
        // Block until this thread has published a completion, then pop it.
        std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
        ctxt->_complete_sync._cond_var.wait(lock,
                                            [ctxt] { return !ctxt->_complete_queue.empty(); });
        completed_op = ctxt->_complete_queue.front();
        ctxt->_complete_queue.pop();
    }
    return completed_op;
}
// Ask every worker to exit its run() loop. Must only be called once all
// scheduled ops have been retired (asserted).
void deepspeed_aio_handle_t::_stop_threads()
{
    assert(0 == _num_pending_ops);
    for (auto& ctxt : _thread_contexts) {
        {
            // The exit flag is read under _work_sync._mutex in run().
            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
            ctxt->_time_to_exit = true;
        }
        ctxt->_work_sync._cond_var.notify_one();
    }
}
// Block until all pending ops complete. For each op: run its finalizer (e.g.
// CUDA copy-back), close its file descriptor, and optionally validate the
// whole transfer. Returns the number of ops retired. Requires at least one
// pending op (asserted).
int deepspeed_aio_handle_t::wait()
{
    assert(_num_pending_ops > 0);
    auto num_completed_ops = 0;

    while (_num_pending_ops > 0) {
        auto completed_op = _wait_for_aio_work();

        completed_op->fini();

        close(completed_op->_fd);

        if (completed_op->_validate) {
            // _num_bytes is per-thread; the full transfer is threads * bytes.
            validate_aio_operation(completed_op->_read_op,
                                   completed_op->_filename.c_str(),
                                   completed_op->data_ptr(),
                                   _num_threads * completed_op->_num_bytes);
        }
        --_num_pending_ops;
        ++num_completed_ops;
    }

    return num_completed_ops;
}
// A parallel op is valid only when the byte count divides evenly across the
// worker threads (each thread handles an equal slice). Logs and returns
// false otherwise.
bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
                                                       const long long int num_bytes)
{
    const auto op_string = read_op ? "Read" : "Write";
    if (num_bytes % get_thread_count()) {
        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
                  << " not divisible by thread count = " << get_thread_count() << std::endl;
        return false;
    }

    return true;
}
// Parallel read of `filename` into `buffer`, sliced evenly across the worker
// threads. If `async`, returns immediately after scheduling (pair with
// wait()); otherwise blocks until completion.
// Returns 0 (async) or wait()'s completed-op count on success, -1 on error.
//
// Fixes:
//  - A buffer/file size mismatch was only printed and then assert()ed, so
//    release builds (NDEBUG) continued into a mismatched transfer. Now the
//    mismatch returns -1 after the diagnostic.
//  - The redundant divisibility assert aborted debug builds before the
//    graceful _is_valid_parallel_aio_op() check could report and return -1;
//    the graceful check alone now handles non-divisible sizes.
int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
                                  const char* filename,
                                  const bool validate,
                                  const bool async)
{
    long long num_file_bytes;
    if (-1 == get_file_size(filename, num_file_bytes)) {
        const auto error_code = errno;
        report_file_error(filename, " fstat for read", error_code);
        return -1;
    }

    const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
    if (buffer_bytes != num_file_bytes) {
        std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
                  << " != " << num_file_bytes << std::endl;
        return -1;
    }

    if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }

    const auto fd = open_file(filename, true);
    if (fd == -1) { return -1; }

    // Each thread reads num_file_bytes / _num_threads from its own offset.
    auto scheduled_op = std::make_shared<io_op_desc_t>(
        true, buffer, fd, filename, (num_file_bytes / _num_threads), validate);

    _schedule_aio_work(scheduled_op);

    if (async) { return 0; }

    return wait();
}
// Parallel write of `buffer` to `filename`, sliced evenly across the worker
// threads. If `async`, returns immediately after scheduling (pair with
// wait()); otherwise blocks until completion.
// Returns 0 (async) or wait()'s completed-op count on success, -1 on error.
//
// Fix: the redundant divisibility assert aborted debug builds before the
// graceful _is_valid_parallel_aio_op() check could report and return -1;
// the graceful check alone now handles non-divisible sizes.
int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
                                   const char* filename,
                                   const bool validate,
                                   const bool async)
{
    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
    if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }

    const auto fd = open_file(filename, false);
    if (fd == -1) { return -1; }

    // Each thread writes num_write_bytes / _num_threads at its own offset.
    auto scheduled_op = std::make_shared<io_op_desc_t>(
        false, buffer, fd, filename, (num_write_bytes / _num_threads), validate);

    _schedule_aio_work(scheduled_op);

    if (async) { return 0; }

    return wait();
}
// Convenience wrappers over pread/pwrite: "sync" variants block without
// validation; "async" variants schedule and return (pair with wait()).

int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
{
    return pread(buffer, filename, false, false);
}

int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
{
    return pwrite(buffer, filename, false, false);
}

int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
{
    return pread(buffer, filename, false, true);
}

int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
{
    return pwrite(buffer, filename, false, true);
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include "deepspeed_aio_thread.h"
// Python-facing AIO handle: owns a pool of worker threads that execute
// parallel reads/writes, plus a private aio_context for the single-threaded
// read()/write() paths.
struct deepspeed_aio_handle_t {
    std::unique_ptr<struct aio_context> _aio_ctxt;  // used by read()/write() only
    const bool _single_submit;                      // submit requests one at a time
    const bool _overlap_events;                     // overlap submit with completion
    const int _num_threads;                         // worker pool size
    deepspeed_aio_config_t _aio_config;             // shared by all workers

    std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
    std::vector<std::thread> _threads;   // one per context, joined in dtor
    int _num_pending_ops;                // ops scheduled but not yet retired by wait()

    deepspeed_aio_handle_t(const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const int num_threads);

    ~deepspeed_aio_handle_t();

    // Configuration accessors.
    const int get_block_size() const;
    const int get_queue_depth() const;
    const bool get_single_submit() const;
    const bool get_overlap_events() const;
    const int get_thread_count() const;

    // Single-threaded, synchronous whole-file transfers on the caller's thread.
    int read(torch::Tensor& buffer, const char* filename, const bool validate);
    int write(const torch::Tensor& buffer, const char* filename, const bool validate);

    // Parallel transfers across the worker pool; async variants require wait().
    int pread(const torch::Tensor& buffer,
              const char* filename,
              const bool validate,
              const bool async);
    int pwrite(const torch::Tensor& buffer,
               const char* filename,
               const bool validate,
               const bool async);

    // Convenience wrappers (no validation).
    int sync_pread(torch::Tensor& buffer, const char* filename);
    int sync_pwrite(const torch::Tensor& buffer, const char* filename);
    int async_pread(torch::Tensor& buffer, const char* filename);
    int async_pwrite(const torch::Tensor& buffer, const char* filename);

    // Internals.
    int wait();
    void _stop_threads();
    void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);
    std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
    bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
};
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_copy.h"
#include <omp.h>
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
#if defined(__AVX512__) or defined(__AVX256__)
union
AVX_Data
{
#if defined(__AVX512__)
__m512
data
;
#else
__m256
data
;
#endif
};
#endif
// Copy `param_size` floats from src to dest, one SIMD register per step when
// AVX is enabled, with an OpenMP-parallel scalar loop handling the remainder
// (or everything, in non-AVX builds).
static void helper_memcpy_1(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH);

    // Process TILE-sized chunks; parallelize SIMD copies within each chunk.
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH) {
            AVX_Data src_4;
            src_4.data = SIMD_LOAD(src + i);
            SIMD_STORE(dest + i, src_4.data);
        }
    }
#endif

    // Scalar tail (or the whole copy when SIMD is unavailable).
    if (param_size > rounded_size) {
#pragma omp parallel for
        for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; }
    }
}
// Copy with a 4-register unroll: rounds down to a multiple of 4*SIMD_WIDTH
// and moves four SIMD registers per inner-loop iteration. The tail (less than
// one unrolled group) is delegated to the 1-register helper_memcpy_1.
static void helper_memcpy_4(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    // Rounding granularity (SIMD_WIDTH << 2) matches the loop stride below.
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));

    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
            AVX_Data src_4[4];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
        }
    }
#endif

    // Remainder handled by the narrower copy helper.
    if (param_size > rounded_size)
        helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
// Copy with an 8-register unroll; the sub-unroll tail goes to helper_memcpy_4.
// (Name keeps the historical "mempcy" typo because the caller below uses it.)
static void helper_mempcy_8(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    // FIX: round down to the 8-register stride (SIMD_WIDTH << 3). The previous
    // code rounded to (SIMD_WIDTH << 2) while the inner loop advances by
    // (SIMD_WIDTH << 3), so a rounded_size that was a multiple of 4*SIMD_WIDTH
    // but not 8*SIMD_WIDTH let the final iteration load/store up to
    // 4*SIMD_WIDTH floats past rounded_size — potentially past param_size.
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 3));

    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
            AVX_Data src_4[8];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);
            src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2));
            src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5);
            src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6);
            src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data);
        }
    }
#endif

    // Tail smaller than one 8-wide group: reuse the 4-wide (then scalar) path.
    if (param_size > rounded_size)
        helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
// Copy the contents of `src` into `dest` using the SIMD/OpenMP helpers.
// Both tensors are treated as flat float buffers. Returns 0 on success.
int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src)
{
    // NOTE(review): contiguous() returns a *copy* when the tensor is not
    // already contiguous, so a non-contiguous `dest` would not observe the
    // copy. Callers appear to pass flat contiguous swap buffers — confirm.
    auto dest_c = dest.contiguous();
    auto src_c = src.contiguous();

    float* dest_ptr = (float*)dest_c.data_ptr();
    float* src_ptr = (float*)src_c.data_ptr();

    // FIX: copy every element. size(0) only counts the first dimension and
    // silently truncated multi-dimensional tensors; numel() is identical for
    // the 1-D case callers were using.
    helper_mempcy_8(dest_ptr, src_ptr, dest_c.numel());

    return 0;
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
#define TILE (1024 * 1024 * 1024)
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif
int
deepspeed_py_memcpy
(
torch
::
Tensor
&
dest
,
const
torch
::
Tensor
&
src
);
deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"
// Python bindings for the DeepSpeed async-I/O extension: two one-shot module
// functions, the tensor memcpy helper, and the stateful aio_handle class.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");

    m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");

    m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");

    // aio_handle(block_size, queue_depth, single_submit, overlap_events,
    //            num_threads) — owns the libaio context and worker threads.
    py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
        .def(py::init<const int, const int, const bool, const bool, const int>())
        // Configuration accessors.
        .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
        .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
        .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
        .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
        .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
        // Single-threaded blocking I/O.
        .def("read", &deepspeed_aio_handle_t::read)
        .def("write", &deepspeed_aio_handle_t::write)
        // Parallel I/O; the trailing bool selects async submission.
        .def("pread", &deepspeed_aio_handle_t::pread)
        .def("pwrite", &deepspeed_aio_handle_t::pwrite)
        // Convenience wrappers around pread/pwrite.
        .def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
        .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
        .def("async_pread", &deepspeed_aio_handle_t::async_pread)
        .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
        // Block until all outstanding async operations complete.
        .def("wait", &deepspeed_aio_handle_t::wait);
}
deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
import
argparse
import
json
from
parse_aio_stats
import
READ_SPEED
,
WRITE_SPEED
,
get_sorted_results
from
perf_sweep_utils
import
BENCH_LOG_DIR
,
READ_LOG_DIR
,
WRITE_LOG_DIR
def parse_arguments():
    """Parse the CLI: --log_dir locates the sweep logs to analyze."""
    parser = argparse.ArgumentParser()
    log_dir_help = f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}'
    parser.add_argument('--log_dir', type=str, default=BENCH_LOG_DIR, help=log_dir_help)
    args = parser.parse_args()
    print(f'args = {args}')
    return args
def validate_args(args):
    """Require both the read and write log sub-folders under args.log_dir."""
    for sub_dir in (READ_LOG_DIR, WRITE_LOG_DIR):
        log_dir = os.path.join(args.log_dir, sub_dir)
        if os.path.isdir(log_dir):
            continue
        print(f'{log_dir} folder is not existent')
        return False
    return True
def convert_to_param(key):
    """Translate a six-field log key into the 'aio' JSON config dict.

    Field order: (submit mode, overlap mode, <unused>, threads, depth, block).
    """
    assert len(key) == 6
    submit_mode, overlap_mode, _, threads, depth, block = key
    return {
        "single_submit": "true" if submit_mode == "single" else "false",
        "overlap_events": "true" if overlap_mode == "overlap" else "false",
        "thread_count": int(threads),
        "queue_depth": int(depth),
        "block_size": int(block),
    }
def generate_aio_param(read_log_dir, write_log_dir):
    # Pick the configuration whose read+write throughput sum is highest and
    # print it as an "aio" JSON snippet.
    _, read_results = get_sorted_results(read_log_dir, READ_SPEED)
    _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED)
    # key[0] is the op name ('read'/'write'); strip it so both sides share keys.
    combined_perf = {key[1:]: value for key, value in read_results.items()}

    for key, value in write_results.items():
        new_key = key[1:]
        if new_key in combined_perf:
            combined_perf[new_key] += value
        else:
            # NOTE(review): write-only configs are pinned to 0 rather than
            # `value`, so a config never wins unless it has a read result —
            # presumably intentional; confirm.
            combined_perf[new_key] = 0

    # Linear scan for the best combined throughput.
    optimal_key = None
    optimal_perf = 0.0
    for key, value in combined_perf.items():
        if value > optimal_perf:
            optimal_perf = value
            optimal_key = key

    aio_param = {"aio": convert_to_param(optimal_key)}

    # Map the stripped key back to the full per-op keys to report each side.
    read_perf_keys = {key[1:]: key for key in read_results.keys()}
    write_perf_keys = {key[1:]: key for key in write_results.keys()}
    optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
    optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)

    print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}')
    print(json.dumps(aio_param, indent=3))
def main():
    """Entry point: validate the log folders, then emit the best aio config."""
    print('Generate aio param')
    args = parse_arguments()
    if not validate_args(args):
        quit()

    read_log_dir = os.path.join(args.log_dir, READ_LOG_DIR)
    write_log_dir = os.path.join(args.log_dir, WRITE_LOG_DIR)
    generate_aio_param(read_log_dir, write_log_dir)


if __name__ == "__main__":
    main()
deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
import
sys
import
argparse
import
json
import
itertools
import
subprocess
import
shutil
from
test_ds_aio_utils
import
refine_integer_value
from
perf_sweep_utils
import
READ_OP_DESC
,
WRITE_OP_DESC
,
BENCH_LOG_DIR
,
\
READ_IO_DIR
,
WRITE_IO_DIR
,
READ_LOG_DIR
,
WRITE_LOG_DIR
OTHER_OPTIONS
=
'--handle'
PERF_SCRIPT
=
'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG
=
{
"block_size"
:
[
"128K"
,
"256K"
],
"queue_depth"
:
[
4
,
16
,
32
],
"overlap_events"
:
[
True
,
False
],
"io_parallel"
:
[
2
,
8
],
"single_submit"
:
[
False
]
}
class Job(object):
    """A shell command with optional output redirection and working directory.

    `output_fd` stays None until open_output_file() is called; both stdout and
    stderr are routed to the same file descriptor.
    """

    def __init__(self, cmd_line, output_file=None, work_dir=None):
        self.cmd_line = cmd_line
        self.output_file = output_file
        self.work_dir = work_dir
        self.output_fd = None

    def cmd(self):
        """Command line as the list it was constructed with."""
        return self.cmd_line

    def get_stdout(self):
        """File object for stdout redirection (None means inherit)."""
        return self.output_fd

    def get_stderr(self):
        """File object for stderr redirection (same sink as stdout)."""
        return self.output_fd

    def get_cwd(self):
        """Working directory for the command (None means inherit)."""
        return self.work_dir

    def open_output_file(self):
        """Open the log file for writing, if one was configured."""
        if self.output_file is None:
            return
        self.output_fd = open(self.output_file, 'w')

    def close_output_file(self):
        """Close and forget the log file descriptor, if it is open."""
        if self.output_fd is None:
            return
        self.output_fd.close()
        self.output_fd = None
class SweepConfig(object):
    # Aggregates the parsed CLI arguments into the knobs the sweep driver uses.
    def __init__(self, args):
        self.nvme_dir = args.nvme_dir   # writeable directory on the NVMe device
        self.io_size = args.io_size     # bytes per test, e.g. "400M"
        self.search_space = get_sweep_config_dict(args.sweep_config)
        self.read = not args.no_read    # run the read sweep?
        self.write = not args.no_write  # run the write sweep?
        self.flush_cache = not args.no_sudo  # page-cache flush requires sudo
        self.log_dir = args.log_dir
        self.loops = args.loops
        # Options appended to every generated benchmark command line.
        self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}'
def parse_arguments():
    # CLI for the sweep driver. Only --nvme_dir is mandatory; everything else
    # has a benchmark-friendly default.
    parser = argparse.ArgumentParser()

    parser.add_argument('--nvme_dir',
                        required=True,
                        type=str,
                        help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.')

    parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.')

    parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.')

    parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.')

    parser.add_argument('--io_size',
                        type=str,
                        default="400M",
                        help='Number of I/O bytes to read/write for performance measurements.')

    parser.add_argument(
        '--no_sudo',
        action='store_true',
        help='Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.')

    parser.add_argument('--log_dir',
                        type=str,
                        default=BENCH_LOG_DIR,
                        help=f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}')

    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')

    args = parser.parse_args()
    print(f'args = {args}')

    return args
def dump_cmd_lines(cmd_lines):
    """Debug helper: print the candidate command lines with their indices."""
    print(f'cmd line count = {len(cmd_lines)}')
    for index, cmd in enumerate(cmd_lines):
        print(f'{index}: {cmd}')
def get_sweep_config_dict(sweep_config_json):
    """Load the sweep search space from a JSON file, or use the default."""
    if sweep_config_json is None:
        return DEFAULT_SWEEP_CONFIG
    with open(sweep_config_json) as fp:
        return json.load(fp)
def get_sweep_cmd_lines(sweep_config_dict):
    """Cross-product of all option values into per-run command-line lists.

    Boolean values render as a bare flag when True and a placeholder space
    when False; other values render as '--key value'.
    """
    def render_axis(key, value_list):
        rendered = []
        for v in value_list:
            if type(v) is bool:
                rendered.append(f'--{key}' if v else ' ')
            else:
                rendered.append(f'--{key} {v}')
        return rendered

    option_axes = [render_axis(key, values) for key, values in sweep_config_dict.items()]
    return [list(combo) for combo in itertools.product(*option_axes)]
def run_job(job):
    # Run `job` through the shell, redirecting stdout/stderr to its log file
    # (when configured), and abort the whole sweep on a non-zero exit code.
    args = ' '.join(job.cmd())
    print(f'args = {args}')
    job.open_output_file()
    proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
    job.close_output_file()
    assert proc.returncode == 0, \
    f"This command failed: {job.cmd()}"
def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
    """Run every perf job; sync+drop caches before each one when permitted."""
    can_flush = flush_cache_job is not None
    for perf_job in sweep_jobs:
        if can_flush:
            run_job(sync_job)
            run_job(flush_cache_job)
        run_job(perf_job)
        run_job(sync_job)
def create_cmd_tags(cmd_line):
    """Map each '--option[ value]' string to {option: value-or-None}."""
    tags = {}
    for param_value in cmd_line:
        pieces = param_value.split()
        if 1 <= len(pieces) <= 2:
            tags[pieces[0]] = pieces[1] if len(pieces) == 2 else None
    return tags


def get_log_file(io_op_desc, cmd_line):
    """Build a log-file name encoding the sweep configuration of cmd_line."""
    QUEUE_DEPTH = "--queue_depth"
    BLOCK_SIZE = "--block_size"
    SINGLE_SUBMIT = "--single_submit"
    OVERLAP_EVENTS = "--overlap_events"
    THREAD_COUNT = "--threads"
    IO_PARALLEL = "--io_parallel"

    # Short tag per option, plus the value assumed when the option is absent.
    tag_map = {
        QUEUE_DEPTH: "d",
        BLOCK_SIZE: "bs",
        SINGLE_SUBMIT: "single",
        OVERLAP_EVENTS: "overlap",
        THREAD_COUNT: "t",
        IO_PARALLEL: "p",
    }
    tag_default = {
        QUEUE_DEPTH: 1,
        BLOCK_SIZE: "1M",
        SINGLE_SUBMIT: "block",
        OVERLAP_EVENTS: "sequential",
        THREAD_COUNT: 1,
        IO_PARALLEL: 1,
    }

    def render_absent(tag):
        # Boolean flags fall back to their bare mode word; others get tag+default.
        if tag in (SINGLE_SUBMIT, OVERLAP_EVENTS):
            return tag_default[tag]
        return f'{tag_map[tag]}{tag_default[tag]}'

    def render_present(tag, value):
        prefix = tag_map[tag]
        return prefix if value is None else f'{prefix}{value}'

    ordered_tags = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
    cmd_tags = create_cmd_tags(cmd_line)
    parts = [io_op_desc]
    for tag in ordered_tags:
        if tag in cmd_tags:
            parts.append(render_present(tag, cmd_tags[tag]))
        else:
            parts.append(render_absent(tag))
    return '_'.join(parts) + '.txt'
def create_perf_jobs(io_op_desc, log_dir, cmd_lines):
    """Wrap each sweep command line in a Job that logs to its own file."""
    launcher = ['python', os.path.join(script_path(), PERF_SCRIPT)]
    return [
        Job(cmd_line=launcher + cmd,
            output_file=os.path.join(log_dir, get_log_file(io_op_desc, cmd)))
        for cmd in cmd_lines
    ]
def script_path():
    """Absolute directory containing the currently executing script."""
    entry_script = os.path.realpath(sys.argv[0])
    return os.path.dirname(entry_script)
def async_io_setup():
    # Deferred imports: deepspeed may not be installed when this file is
    # imported. True when the async-I/O extension can be built (libaio-dev).
    import deepspeed
    from deepspeed.ops.aio import AsyncIOBuilder
    return AsyncIOBuilder().is_compatible()
def get_block_size_and_count(io_bytes):
    """Factor io_bytes into dd-style (block_size, count), block_size a power of 1024."""
    bytes_in_KB = 1024
    block_size, block_count = 1, io_bytes
    while True:
        quotient, remainder = divmod(block_count, bytes_in_KB)
        if remainder != 0:
            break
        block_size *= bytes_in_KB
        block_count = quotient
    return int(block_size), int(block_count)
def create_read_file(sweep_config):
    # Materialize a random file of sweep_config.io_size bytes on the NVMe
    # device via dd, so read benchmarks have real data to fetch.
    read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}')
    os.makedirs(read_folder, exist_ok=True)
    read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
    # Split the byte count into a dd-friendly block size and count.
    block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
    dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
    print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    run_job(dd_job)
    print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    # Returns the folder too so the caller can delete everything afterwards.
    return read_folder, read_file_name
def remove_folder(folder):
    """Delete a directory tree; fail loudly if it is not a directory."""
    missing_msg = f"Error: cannot remove {folder} - folder not found"
    assert os.path.isdir(folder), missing_msg
    shutil.rmtree(folder)
def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    # Generate the source file, prepend the read-specific options to every
    # command line, run the sweep, then clean up the generated data.
    read_folder, read_file_name = create_read_file(sweep_config)
    read_option = f'--read_file {read_file_name}'
    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(read_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)

    # Remove the dd-generated input data from the NVMe device.
    remove_folder(read_folder)
def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    # Prepare an output folder on the NVMe device, prepend the write-specific
    # options to every command line, run the sweep, then delete the output.
    write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}')
    os.makedirs(write_folder, exist_ok=True)
    write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
    write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(write_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)

    # Remove the benchmark's output files from the NVMe device.
    remove_folder(write_folder)
def main():
    # Entry point: check the async-I/O environment, build the sweep command
    # lines, then run the read and/or write sweeps with cache flushing
    # between jobs when sudo is available.
    print("Running performance sweep of deepspeed nvme library")

    if not async_io_setup():
        error_msg = """
            Failing because environment is not properly configured for deepspeed async i/o module.
            Possible fix: apt install libaio-dev.
        """
        print(error_msg)
        quit()

    args = parse_arguments()

    sweep_config = SweepConfig(args)
    cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)

    if sweep_config.flush_cache:
        # Dropping the page cache needs root; skipped when --no_sudo was given.
        flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"])
    else:
        flush_cache_job = None

    sync_job = Job(cmd_line=['sync'])

    if sweep_config.read:
        run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)

    if sweep_config.write:
        run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)


if __name__ == "__main__":
    main()
deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
torch
import
os
import
time
from
deepspeed.ops.aio
import
AsyncIOBuilder
from
multiprocessing
import
Pool
,
Barrier
from
test_ds_aio_utils
import
report_results
,
task_log
,
task_barrier
def pre_basic(args, tid, read_op):
    # Per-task setup: allocate a pinned CPU buffer sized to the file (reads)
    # or the requested write size, and build the context dict the main/post
    # phases consume.
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    # Each writer task gets its own output file, suffixed with the task id.
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0
    return ctxt
def pre_basic_read(pool_params):
    """Pool entry: build a read context for one task."""
    args, tid = pool_params
    return pre_basic(args, tid, True)


def pre_basic_write(pool_params):
    """Pool entry: build a write context for one task."""
    args, tid = pool_params
    return pre_basic(args, tid, False)
def post_basic(pool_params):
    """Release the I/O buffer after the timed loop and return the context."""
    _, _, ctxt = pool_params
    io_buffer = ctxt["buffer"]
    io_buffer.detach()
    ctxt["buffer"] = None
    return ctxt
def main_basic_read(pool_params):
    # Timed phase: one synchronous module-level aio_read of the whole file.
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
                                     ctxt['file'],
                                     args.block_size,
                                     args.queue_depth,
                                     args.single_submit,
                                     args.overlap_events,
                                     args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_basic_write(pool_params):
    # Timed phase: one synchronous module-level aio_write of the whole buffer.
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
                                      ctxt['file'],
                                      args.block_size,
                                      args.queue_depth,
                                      args.single_submit,
                                      args.overlap_events,
                                      args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt
def get_schedule(args, read_op):
    """Select the pre/main/post callables for a read or write run."""
    if read_op:
        return {'pre': pre_basic_read, 'post': post_basic, 'main': main_basic_read}
    return {'pre': pre_basic_write, 'post': post_basic, 'main': main_basic_write}
def _aio_handle_tasklet(pool_params):
    # Worker body: pre -> (main x loops) -> post, with a barrier between
    # phases so all tasks measure the same workload window.
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    # (wall time incl. barriers, pure I/O time, total bytes moved)
    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
    # Pool initializer: publish the shared Barrier as a module global so each
    # worker process can rendezvous via task_barrier().
    global aio_barrier
    aio_barrier = b
def aio_basic_multiprocessing(args, read_op):
    # Fan out one tasklet per thread via a process Pool, synchronized with a
    # shared Barrier, then aggregate and report the per-task timings.
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
torch
import
os
import
time
from
multiprocessing
import
Pool
,
Barrier
from
deepspeed.ops.aio
import
AsyncIOBuilder
from
test_ds_aio_utils
import
report_results
,
task_log
,
task_barrier
def pre_handle(args, tid, read_op):
    # Per-task setup: allocate the I/O buffer (GPU or pinned CPU), create a
    # deepspeed aio handle, and build the context dict for the later phases.
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    # Each writer task gets its own output file, suffixed with the task id.
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    if args.gpu:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
    else:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    # io_parallel == 0/None means a single internal aio thread.
    io_parallel = args.io_parallel if args.io_parallel else 1
    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
                                                args.queue_depth,
                                                args.single_submit,
                                                args.overlap_events,
                                                io_parallel)
    task_log(tid, f'created deepspeed aio handle')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['handle'] = handle
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0
    return ctxt
def pre_handle_read(pool_params):
    """Pool entry: build a read context (buffer + aio handle) for one task."""
    args, tid = pool_params
    return pre_handle(args, tid, True)


def pre_handle_write(pool_params):
    """Pool entry: build a write context (buffer + aio handle) for one task."""
    args, tid = pool_params
    return pre_handle(args, tid, False)
def post_handle(pool_params):
    """Release the I/O buffer after the timed loop and return the context."""
    _, _, ctxt = pool_params
    io_buffer = ctxt["buffer"]
    io_buffer.detach()
    ctxt["buffer"] = None
    return ctxt
def main_parallel_read(pool_params):
    # Timed phase: asynchronous pread (async flag True) followed by wait(),
    # used when --io_parallel enables multiple internal aio threads.
    args, tid, ctxt = pool_params
    handle = ctxt['handle']

    start_time = time.time()
    ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_parallel_write(pool_params):
    # Timed phase: asynchronous pwrite followed by wait().
    args, tid, ctxt = pool_params
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt
def main_handle_read(pool_parms):
    # Timed phase: blocking handle.read (no explicit wait needed).
    args, tid, ctxt = pool_parms
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_handle_write(pool_parms):
    # Timed phase: blocking handle.write.
    args, tid, ctxt = pool_parms
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt
def get_schedule(args, read_op):
    """Select pre/main/post callables; parallel variants when io_parallel set."""
    if read_op:
        main_fn = main_parallel_read if args.io_parallel else main_handle_read
        return {'pre': pre_handle_read, 'post': post_handle, 'main': main_fn}
    main_fn = main_parallel_write if args.io_parallel else main_handle_write
    return {'pre': pre_handle_write, 'post': post_handle, 'main': main_fn}
def _aio_handle_tasklet(pool_params):
    # Worker body: pre -> (main x loops) -> post, with a barrier between
    # phases so all tasks measure the same workload window.
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    # (wall time incl. barriers, pure I/O time, total bytes moved)
    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
    # Pool initializer: publish the shared Barrier as a module global so each
    # worker process can rendezvous via task_barrier().
    global aio_barrier
    aio_barrier = b
def aio_handle_multiprocessing(args, read_op):
    # Fan out one tasklet per thread via a process Pool, synchronized with a
    # shared Barrier, then aggregate and report the per-task timings.
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
import
argparse
import
re
READ_SPEED
=
'read_speed'
WRITE_SPEED
=
'write_speed'
PERF_METRICS
=
[
READ_SPEED
,
WRITE_SPEED
]
METRIC_SEARCH
=
{
READ_SPEED
:
'E2E Read Speed'
,
WRITE_SPEED
:
'E2E Write Speed'
}
def parse_arguments():
    """CLI: the log folder plus which metric (read_speed/write_speed) to report."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs')
    parser.add_argument('--metric',
                        type=str,
                        required=True,
                        help='Performance metric to report: [read_speed|write_speed]')
    args = parser.parse_args()
    print(f'args = {args}')
    return args
def extract_value(key, file):
    """Decode one underscore-separated log-name field into a typed value.

    't4'/'d8'/'p1' -> int; 'bs128K'/'bs2M' -> bytes as int; 'ds*' and anything
    unrecognized is returned unchanged; malformed numeric fields yield None
    (with a diagnostic naming `file`).
    """
    INVALID_PREFIXES = ["ds"]
    for p in INVALID_PREFIXES:
        if key.startswith(p):
            return key
    try:
        if key[0] in ['t', 'd', 'p']:
            return int(key[1:])
        if key.startswith("bs"):
            if key.endswith('K'):
                v = key[2:].split('K')
                return int(v[0]) * 1024
            elif key.endswith('M'):
                v = key[2:].split('M')
                return int(v[0]) * 1024 * 1024
            else:
                return int(key[2:])
    # FIX: was a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt; only parsing failures are expected here.
    except (ValueError, IndexError):
        print(f"{file}: extract_value fails on {key}")
        return None

    return key
def get_file_key(file):
    """Decode a log-file basename into a tuple of typed configuration fields."""
    stem, _ = os.path.splitext(os.path.basename(file))
    return tuple(extract_value(field, file) for field in stem.split('_'))
def get_thread_count(file):
    """Thread count encoded as a 't<N>' field in the log-file name; default 1."""
    stem, _ = os.path.splitext(os.path.basename(file))
    for field in stem.split('_'):
        if field[0] == 't':
            return int(field[1:])
    return 1
"""
Extract performance metric from log file.
Sample file lines are:
Task Read Latency = 0.031647682189941406 sec
Task Read Speed = 12.342926020792527 GB/sec
E2E Read Latency = 0.031697988510131836 sec
E2E Read Speed = 12.323337169333062 GB/sec
For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned
"""
def get_metric(file, metric):
    """Scan a benchmark log for `metric` and return its float value, or None.

    Speed metrics take the second-to-last whitespace token of the matching
    line (the number before the 'GB/sec' unit); other metrics take everything
    after the '=' sign.
    """
    # FIX: removed an unused `thread_count = get_thread_count(file)` local —
    # the call is pure, so dropping it changes nothing observable.
    with open(file) as f:
        for line in f.readlines():
            if line.startswith(METRIC_SEARCH[metric]):
                if metric in [READ_SPEED, WRITE_SPEED]:
                    fields = line.split()
                    return float(fields[-2])
                else:
                    fields = line.split('=')
                    return float(fields[-1])

    return None
def validate_args(args):
    """The metric must be known and the log folder must exist."""
    if args.metric not in PERF_METRICS:
        print(f'{args.metric} is not a valid performance metrics')
        return False
    if not os.path.isdir(args.log_dir):
        print(f'{args.log_dir} folder is not existent')
        return False
    return True
def get_results(log_files, metric):
    """Map each log file's decoded key to its extracted metric value."""
    return {get_file_key(f): get_metric(f, metric) for f in log_files}
def get_sorted_results(log_dir, metric):
    """Return (sorted keys, key->metric dict) over every file in log_dir."""
    log_paths = [
        os.path.join(log_dir, name) for name in os.listdir(log_dir)
        if os.path.isfile(os.path.join(log_dir, name))
    ]
    results = get_results(log_paths, metric)
    return sorted(results.keys()), results
def main():
    """Entry point: print every (config key, metric value) pair, sorted."""
    print("Parsing aio statistics")
    args = parse_arguments()
    if not validate_args(args):
        quit()

    sorted_keys, results = get_sorted_results(args.log_dir, args.metric)
    for k in sorted_keys:
        print(f'{k} = {results[k]}')


if __name__ == "__main__":
    main()
deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py
deleted
100644 → 0
View file @
1b2721ad
# Prefix shared by every artifact (directory, log) the aio benchmark sweep creates.
SCRIPT_PREFIX = '_aio_bench'
# Operation names used to build the per-operation directory names below.
WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read'
# Directories holding the data files read/written during the sweep.
READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io'
WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io'
# Directories holding benchmark log output: one combined, plus per-operation.
BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs'
READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs'
WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs'
deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh
deleted
100644 → 0
View file @
1b2721ad
#!/bin/bash
# Require exactly two arguments: the input file to read and the log output dir.
if [[ $# -ne 2 ]]; then
    echo "Usage: $0 <input file> <output log dir>"
    exit 1
fi
# Verify the async I/O prerequisites via the helper script; abort the whole
# run with a hint if the environment is not usable.
validate_environment()
{
    if ! python ./validate_async_io.py; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}
# Fail fast if async I/O support is missing.
validate_environment

# First argument: the file every read benchmark will read from.
INPUT_FILE=$1
if [[ ! -f ${INPUT_FILE} ]]; then
    echo "Input file not found: ${INPUT_FILE}"
    exit 1
fi

# Second argument: parent directory for this sweep's log output.
LOG_DIR=$2/aio_perf_sweep
RUN_SCRIPT=./test_ds_aio.py
READ_OPT="--read_file ${INPUT_FILE}"

# Reuse the log directory if it exists (clearing old logs), else create it.
if [[ -d ${LOG_DIR} ]]; then
    rm -f ${LOG_DIR}/*
else
    mkdir -p ${LOG_DIR}
fi

# Command strings eval'd between runs: drop the page cache (needs sudo) so
# each benchmark measures real disk reads, and flush dirty pages afterwards.
DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"
# Sweep the read-benchmark parameter space:
#   sub: submission mode (single_submit vs block)
#   ov:  event handling (overlap_events vs sequential)
#   t:   thread count; p: io_parallel; d: queue depth; bs: block size
# Each combination runs once, with the page cache dropped beforehand and a
# sync afterwards; output goes to a per-combination log file.
for sub in single block; do
    if [[ $sub == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ $ov == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        # Scheduler-level options vs per-run I/O options.
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        # Log file name encodes every swept parameter so the
                        # result parser can recover them from the filename.
                        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        # Echo each command before eval'ing it, for traceability.
                        echo ${DISABLE_CACHE}
                        echo ${cmd}
                        echo ${SYNC}
                        eval ${DISABLE_CACHE}
                        eval ${cmd}
                        eval ${SYNC}
                        # Brief settle time between runs.
                        sleep 2
                    done
                done
            done
        done
    done
done
Prev
1
…
10
11
12
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment