Commit 7d1a83a9 authored by aiss's avatar aiss
Browse files

push Deepspeed 0.6.3 rocm version

parent ab5534fc
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include "deepspeed_aio_thread.h"
// Handle wrapping a libaio context plus an optional pool of I/O worker
// threads. Exposed to Python as "aio_handle" for submitting (NVMe) tensor
// read/write requests and waiting for their completion.
struct deepspeed_aio_handle_t {
    std::unique_ptr<struct aio_context> _aio_ctxt;  // low-level aio state (block size, queue depth)
    const bool _single_submit;   // submit requests one-at-a-time instead of batched
    const bool _overlap_events;  // overlap submission with completion processing
    const int _num_threads;      // worker threads servicing parallel requests
    deepspeed_aio_config_t _aio_config;

    std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
    std::vector<std::thread> _threads;
    int _num_pending_ops;  // async requests scheduled but not yet waited on

    deepspeed_aio_handle_t(const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const int num_threads);

    ~deepspeed_aio_handle_t();

    // Configuration accessors.
    const int get_block_size() const;
    const int get_queue_depth() const;
    const bool get_single_submit() const;
    const bool get_overlap_events() const;
    const int get_thread_count() const;

    // Blocking single-request I/O.
    int read(torch::Tensor& buffer, const char* filename, const bool validate);
    int write(const torch::Tensor& buffer, const char* filename, const bool validate);

    // Thread-pool-parallel I/O; when async is true the call is expected to
    // return before completion (see wait()).
    int pread(const torch::Tensor& buffer,
              const char* filename,
              const bool validate,
              const bool async);
    int pwrite(const torch::Tensor& buffer,
               const char* filename,
               const bool validate,
               const bool async);

    // Convenience wrappers around pread/pwrite.
    int sync_pread(torch::Tensor& buffer, const char* filename);
    int sync_pwrite(const torch::Tensor& buffer, const char* filename);
    int async_pread(torch::Tensor& buffer, const char* filename);
    int async_pwrite(const torch::Tensor& buffer, const char* filename);

    // Block until pending async requests complete.
    int wait();

    // Internal helpers (definitions elsewhere).
    void _stop_threads();

    void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);

    std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();

    bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
};
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_copy.h"
#include <omp.h>
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
#if defined(__AVX512__) or defined(__AVX256__)
// Selects the widest available packed-float vector type at compile time.
union AVX_Data {
#if defined(__AVX512__)
    __m512 data;  // 16 floats
#else
    __m256 data;  // 8 floats
#endif
};
#endif
// Copy param_size floats from src to dest, one SIMD vector at a time.
// The vectorized path covers the prefix that is a multiple of SIMD_WIDTH;
// the scalar loop finishes the remainder (and handles everything in
// non-AVX builds, where rounded_size stays 0).
static void helper_memcpy_1(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH);
    // Walk in TILE-sized chunks so each omp parallel region works on a
    // bounded range.
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;  // exclusive end of this chunk
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH) {
            AVX_Data src_4;
            src_4.data = SIMD_LOAD(src + i);
            SIMD_STORE(dest + i, src_4.data);
        }
    }
#endif
    // Scalar tail copy.
    if (param_size > rounded_size) {
#pragma omp parallel for
        for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; }
    }
}
// Copy param_size floats with a 4-vector unrolled SIMD loop; the tail
// (fewer than 4 * SIMD_WIDTH floats) is delegated to helper_memcpy_1.
static void helper_memcpy_4(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    // Prefix that is a multiple of 4 * SIMD_WIDTH, matching the loop stride.
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;  // exclusive end of this chunk
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
            AVX_Data src_4[4];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
        }
    }
#endif
    // Narrower fallback for the tail (and for non-AVX builds).
    if (param_size > rounded_size)
        helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
// Copy param_size floats with an 8-vector unrolled SIMD loop; the tail is
// delegated to helper_memcpy_4 / helper_memcpy_1.
// (Name "mempcy" is a historical typo, kept because callers use it.)
static void helper_mempcy_8(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    // BUGFIX: the inner loop advances by (SIMD_WIDTH << 3) and touches 8
    // vectors per iteration, so the vectorized prefix must be a multiple of
    // 8 * SIMD_WIDTH. The previous ROUND_DOWN(..., SIMD_WIDTH << 2) could
    // leave rounded_size = 4 * SIMD_WIDTH past the last full 8-wide group,
    // making the final iteration read/write up to 4 * SIMD_WIDTH floats
    // beyond param_size.
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 3));
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;  // exclusive end of this chunk
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
            AVX_Data src_4[8];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);
            src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2));
            src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5);
            src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6);
            src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data);
        }
    }
#endif
    // Narrower fallback for the tail (and for non-AVX builds).
    if (param_size > rounded_size)
        helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
// Copy the contents of tensor src into tensor dest using the SIMD memcpy
// helpers. Tensors are treated as flat float buffers; returns 0 on success.
// NOTE(review): .contiguous() returns a copy for non-contiguous inputs, in
// which case the writes below would not reach `dest`; callers are expected
// to pass contiguous float tensors — confirm against call sites.
int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src)
{
    auto dest_c = dest.contiguous();
    auto src_c = src.contiguous();

    float* dest_ptr = (float*)dest_c.data_ptr();
    float* src_ptr = (float*)src_c.data_ptr();

    // BUGFIX: copy every element, not just the extent of the first dimension.
    // size(0) under-copied multi-dimensional tensors; numel() is the full
    // element count and equals size(0) for the 1-D case.
    helper_mempcy_8(dest_ptr, src_ptr, dest_c.numel());

    return 0;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
// Number of floats handled per OpenMP-parallelized chunk in the copy helpers.
#define TILE (1024 * 1024 * 1024)

#if defined(__AVX512__)
// AVX-512: 512-bit vectors, 16 packed single-precision floats.
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
// AVX2: 256-bit vectors, 8 packed single-precision floats.
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif

// Flat float-buffer tensor copy entry point exposed to Python.
int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src);
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"
// Python bindings for the DeepSpeed async I/O extension.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    // One-shot read/write entry points.
    m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");
    m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");
    m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");

    // Handle-based API: reusable aio context with optional thread parallelism.
    py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
        .def(py::init<const int, const int, const bool, const bool, const int>())
        .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
        .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
        .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
        .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
        .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
        .def("read", &deepspeed_aio_handle_t::read)
        .def("write", &deepspeed_aio_handle_t::write)
        .def("pread", &deepspeed_aio_handle_t::pread)
        .def("pwrite", &deepspeed_aio_handle_t::pwrite)
        .def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
        .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
        .def("async_pread", &deepspeed_aio_handle_t::async_pread)
        .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
        .def("wait", &deepspeed_aio_handle_t::wait);
}
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import argparse
import json
from parse_aio_stats import READ_SPEED, WRITE_SPEED, get_sorted_results
from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR
def parse_arguments():
    """Parse command-line arguments: --log_dir (root folder of sweep logs)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--log_dir',
        type=str,
        default=BENCH_LOG_DIR,
        help=
        f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}'
    )
    args = parser.parse_args()
    print(f'args = {args}')
    return args
def validate_args(args):
    """Return True only when both the read and write log folders exist under args.log_dir."""
    for subdir in (READ_LOG_DIR, WRITE_LOG_DIR):
        log_dir = os.path.join(args.log_dir, subdir)
        if not os.path.isdir(log_dir):
            print(f'{log_dir} folder is not existent')
            return False
    return True
def convert_to_param(key):
    """Translate a six-field log-file key tuple into an aio config dict.

    Field layout: (submit mode, overlap mode, <op tag>, threads, queue depth,
    block size). Numeric fields are converted to int.
    """
    assert len(key) == 6
    submit_mode, overlap_mode = key[0], key[1]
    return {
        "single_submit": "true" if submit_mode == "single" else "false",
        "overlap_events": "true" if overlap_mode == "overlap" else "false",
        "thread_count": int(key[3]),
        "queue_depth": int(key[4]),
        "block_size": int(key[5]),
    }
def generate_aio_param(read_log_dir, write_log_dir):
    """Pick the config with best combined read+write speed and print it as
    an "aio" JSON parameter block."""
    _, read_results = get_sorted_results(read_log_dir, READ_SPEED)
    _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED)
    # Drop the leading read/write op tag so keys from both sweeps line up.
    combined_perf = {key[1:]: value for key, value in read_results.items()}
    for key, value in write_results.items():
        new_key = key[1:]
        if new_key in combined_perf:
            combined_perf[new_key] += value
        else:
            # NOTE(review): configs seen only in the write sweep score 0 rather
            # than their write speed — presumably to restrict the winner to
            # configs measured in both sweeps; confirm intent.
            combined_perf[new_key] = 0
    optimal_key = None
    optimal_perf = 0.0
    for key, value in combined_perf.items():
        if value > optimal_perf:
            optimal_perf = value
            optimal_key = key
    # NOTE(review): if no config has positive combined perf, optimal_key stays
    # None and convert_to_param() raises on it.
    aio_param = {"aio": convert_to_param(optimal_key)}
    # Map trimmed keys back to full per-sweep keys to report both speeds.
    read_perf_keys = {key[1:]: key for key in read_results.keys()}
    write_perf_keys = {key[1:]: key for key in write_results.keys()}
    optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
    optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)
    print(
        f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}'
    )
    print(json.dumps(aio_param, indent=3))
def main():
    """Entry point: validate CLI args, then emit the optimal aio parameters."""
    print('Generate aio param')
    args = parse_arguments()
    if not validate_args(args):
        quit()

    generate_aio_param(os.path.join(args.log_dir, READ_LOG_DIR),
                       os.path.join(args.log_dir, WRITE_LOG_DIR))
if __name__ == "__main__":
main()
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import sys
import argparse
import json
import itertools
import subprocess
import shutil
from test_ds_aio_utils import refine_integer_value
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG = {
"block_size": ["128K",
"256K"],
"queue_depth": [4,
16,
32],
"overlap_events": [True,
False],
"io_parallel": [2,
8],
"single_submit": [False]
}
class Job(object):
    """A shell command plus an optional output-capture file and working directory."""

    def __init__(self, cmd_line, output_file=None, work_dir=None):
        self.cmd_line = cmd_line
        self.output_file = output_file
        self.work_dir = work_dir
        self.output_fd = None

    def cmd(self):
        """Command line as originally supplied (list of argument strings)."""
        return self.cmd_line

    def get_stdout(self):
        """File object capturing stdout, or None before open_output_file()."""
        return self.output_fd

    def get_stderr(self):
        """stderr shares the same capture file as stdout."""
        return self.output_fd

    def get_cwd(self):
        """Working directory for the command (None = inherit)."""
        return self.work_dir

    def open_output_file(self):
        """Open the capture file for writing; no-op when no file was configured."""
        if self.output_file is None:
            return
        self.output_fd = open(self.output_file, 'w')

    def close_output_file(self):
        """Close and clear the capture file descriptor if one is open."""
        if self.output_fd is None:
            return
        self.output_fd.close()
        self.output_fd = None
class SweepConfig(object):
    """Aggregates the sweep settings derived from parsed command-line args."""

    def __init__(self, args):
        self.nvme_dir = args.nvme_dir        # directory on the NVMe device for test files
        self.io_size = args.io_size          # bytes per test, as a string (e.g. "400M")
        self.search_space = get_sweep_config_dict(args.sweep_config)
        self.read = not args.no_read         # run the read sweep?
        self.write = not args.no_write       # run the write sweep?
        self.flush_cache = not args.no_sudo  # page-cache flushing requires sudo
        self.log_dir = args.log_dir
        self.loops = args.loops
        # Options appended to every benchmark command line.
        self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}'
def parse_arguments():
    """Parse sweep options: target dir, config file, I/O size, sudo and logging."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--nvme_dir',
        required=True,
        type=str,
        help=
        'Directory in which to perform I/O tests. A writeable directory on a NVMe device.'
    )
    parser.add_argument('--sweep_config',
                        type=str,
                        default=None,
                        help='Performance sweep configuration json file.')
    parser.add_argument('--no_read',
                        action='store_true',
                        help='Disable read performance measurements.')
    parser.add_argument('--no_write',
                        action='store_true',
                        help='Disable write performance measurements.')
    parser.add_argument(
        '--io_size',
        type=str,
        default="400M",
        help='Number of I/O bytes to read/write for performance measurements.')
    parser.add_argument(
        '--no_sudo',
        action='store_true',
        help=
        'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.'
    )
    parser.add_argument(
        '--log_dir',
        type=str,
        default=BENCH_LOG_DIR,
        help=
        f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}'
    )
    parser.add_argument('--loops',
                        type=int,
                        default=1,
                        help='Count of operation repetitions')
    args = parser.parse_args()
    print(f'args = {args}')
    return args
def dump_cmd_lines(cmd_lines):
    """Debug helper: print every candidate command line with its index."""
    print(f'cmd line count = {len(cmd_lines)}')
    for i, cmd in enumerate(cmd_lines):
        print(f'{i}: {cmd}')
def get_sweep_config_dict(sweep_config_json):
    """Load the sweep search space from a JSON file, falling back to the
    built-in default when no file is given."""
    if sweep_config_json is None:
        return DEFAULT_SWEEP_CONFIG

    with open(sweep_config_json) as fp:
        return json.load(fp)
def get_sweep_cmd_lines(sweep_config_dict):
    """Expand a {option: [values]} search space into the cross product of
    command-line fragments.

    Boolean values become presence/absence flags ('--key' when True, a blank
    placeholder when False); every other value becomes a '--key value' pair.

    Returns:
        List of lists of argument strings, one inner list per configuration.
    """
    def flatten_options(key, value_list):
        flat_list = []
        for v in value_list:
            # isinstance() is the idiomatic type test (was: `not type(v) is bool`).
            if not isinstance(v, bool):
                flat_list.append(f'--{key} {v}')
            elif v:
                flat_list.append(f'--{key}')
            else:
                flat_list.append(' ')
        return flat_list

    flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()]
    cmd_list = list(itertools.product(*flat_list))
    cmd_list = [list(cmd) for cmd in cmd_list]
    return cmd_list
def run_job(job):
    """Run a Job through the shell, capturing output to its file.

    Raises AssertionError when the command exits non-zero.
    """
    # The argument list is joined into a single string because shell=True.
    args = ' '.join(job.cmd())
    print(f'args = {args}')
    job.open_output_file()
    proc = subprocess.run(args=args,
                          shell=True,
                          stdout=job.get_stdout(),
                          stderr=job.get_stderr(),
                          cwd=job.get_cwd())
    job.close_output_file()
    assert proc.returncode == 0, \
        f"This command failed: {job.cmd()}"
def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
    """Run every perf job, flushing the page cache beforehand when a flush
    job is available, and syncing the filesystem around each measurement."""
    flush_enabled = flush_cache_job is not None
    for perf_job in sweep_jobs:
        if flush_enabled:
            run_job(sync_job)
            run_job(flush_cache_job)

        run_job(perf_job)
        run_job(sync_job)
def create_cmd_tags(cmd_line):
    """Map each '--opt [value]' fragment of cmd_line to {option: value-or-None}.

    Fragments with zero or more than two whitespace-separated fields (e.g. the
    ' ' placeholder for disabled boolean flags) are ignored.
    """
    tags = {}
    for fragment in cmd_line:
        pieces = fragment.split()
        if len(pieces) == 1:
            tags[pieces[0]] = None
        elif len(pieces) == 2:
            option, value = pieces
            tags[option] = value
    return tags
def get_log_file(io_op_desc, cmd_line):
    """Compose the log file name encoding one sweep configuration.

    The name is '<op>_<submit>_<overlap>_<t#>_<p#>_<d#>_<bs#>.txt', where each
    field comes from cmd_line when present and from the defaults otherwise.
    """
    QUEUE_DEPTH = "--queue_depth"
    BLOCK_SIZE = "--block_size"
    SINGLE_SUBMIT = "--single_submit"
    OVERLAP_EVENTS = "--overlap_events"
    THREAD_COUNT = "--threads"
    IO_PARALLEL = "--io_parallel"

    # Short tag emitted for each option.
    tag_map = {
        QUEUE_DEPTH: "d",
        BLOCK_SIZE: "bs",
        SINGLE_SUBMIT: "single",
        OVERLAP_EVENTS: "overlap",
        THREAD_COUNT: "t",
        IO_PARALLEL: "p"
    }

    # Encoded value when an option is absent from the command line.
    tag_default = {
        QUEUE_DEPTH: 1,
        BLOCK_SIZE: "1M",
        SINGLE_SUBMIT: "block",
        OVERLAP_EVENTS: "sequential",
        THREAD_COUNT: 1,
        IO_PARALLEL: 1
    }

    def get_default_value(tag):
        value = tag_default[tag]
        # Boolean-style options encode their default as a bare word.
        if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]:
            return value
        return f'{tag_map[tag]}{value}'

    def get_config_value(tag, value):
        tag_key = tag_map[tag]
        if value is None:
            return tag_key
        return f'{tag_key}{value}'

    cmd_tags = create_cmd_tags(cmd_line)
    log_tags = [io_op_desc]
    for tag in (SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE):
        if tag in cmd_tags:
            log_tags.append(get_config_value(tag, cmd_tags[tag]))
        else:
            log_tags.append(get_default_value(tag))

    return '_'.join(log_tags) + '.txt'
def create_perf_jobs(io_op_desc, log_dir, cmd_lines):
    """Build one Job per command line, each capturing output to a log file
    whose name is derived from the command's options."""
    py_cmd = ['python', os.path.join(script_path(), PERF_SCRIPT)]
    return [
        Job(cmd_line=py_cmd + cmd,
            output_file=os.path.join(log_dir, get_log_file(io_op_desc, cmd)))
        for cmd in cmd_lines
    ]
def script_path():
    """Absolute directory containing the currently running script."""
    resolved = os.path.realpath(sys.argv[0])
    return os.path.dirname(resolved)
def async_io_setup():
    """Return True when the DeepSpeed async I/O extension can be built here
    (i.e. the libaio prerequisites are available)."""
    import deepspeed
    from deepspeed.ops.aio import AsyncIOBuilder
    return AsyncIOBuilder().is_compatible()
def get_block_size_and_count(io_bytes):
    """Factor io_bytes into dd-friendly (block_size, block_count), where
    block_size is the largest power of 1024 dividing io_bytes.

    Args:
        io_bytes: positive number of bytes to transfer.

    Returns:
        Tuple of ints with block_size * block_count == io_bytes.

    Raises:
        AssertionError: when io_bytes is not positive (0 previously hung this
        loop forever, since 0 % 1024 == 0).
    """
    assert io_bytes > 0, f'io_bytes must be positive, got {io_bytes}'
    block_size = 1
    block_count = io_bytes
    bytes_in_KB = 1024
    # Pull out factors of 1024; floor division keeps everything integral
    # (the previous '/=' went through float, risking rounding on huge sizes).
    while block_count % bytes_in_KB == 0:
        block_size *= bytes_in_KB
        block_count //= bytes_in_KB
    return int(block_size), int(block_count)
def create_read_file(sweep_config):
    """Create the random input file for read tests under the NVMe dir via dd.

    Returns (folder, file path) so the caller can clean up afterwards.
    """
    read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}')
    os.makedirs(read_folder, exist_ok=True)
    read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
    # Convert the human-readable size (e.g. "400M") into dd's bs/count pair.
    block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
    dd_job = Job(cmd_line=[
        f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'
    ])
    print(
        f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....'
    )
    run_job(dd_job)
    print(
        f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....'
    )
    return read_folder, read_file_name
def remove_folder(folder):
    """Recursively delete folder; fail loudly if it does not exist."""
    assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found"
    shutil.rmtree(folder)
def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    """Run the full read sweep: create the input file, measure every config,
    then delete the scratch folder."""
    read_folder, read_file_name = create_read_file(sweep_config)
    # Prepend the read target and shared options to every config's arguments.
    read_option = f'--read_file {read_file_name}'
    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd
                      for cmd in cmd_lines]
    log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)
    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC,
                                 log_dir=log_folder,
                                 cmd_lines=read_cmd_lines)
    launch_sweep(sweep_jobs=perf_jobs,
                 sync_job=sync_job,
                 flush_cache_job=flush_cache_job)
    remove_folder(read_folder)
def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    """Run the full write sweep: measure every config writing into a scratch
    folder on the NVMe device, then delete that folder."""
    write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}')
    os.makedirs(write_folder, exist_ok=True)
    write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
    # Prepend the write target/size and shared options to every config.
    write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd
                       for cmd in cmd_lines]
    log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)
    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC,
                                 log_dir=log_folder,
                                 cmd_lines=write_cmd_lines)
    launch_sweep(sweep_jobs=perf_jobs,
                 sync_job=sync_job,
                 flush_cache_job=flush_cache_job)
    remove_folder(write_folder)
def main():
    """Entry point: verify aio support, then run the configured read/write sweeps."""
    print("Running performance sweep of deepspeed nvme library")
    if not async_io_setup():
        error_msg = """
Failing because environment is not properly configured for deepspeed async i/o module.
Possible fix: apt install libaio-dev.
"""
        print(error_msg)
        quit()

    args = parse_arguments()
    sweep_config = SweepConfig(args)
    cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)
    if sweep_config.flush_cache:
        # Dropping the page cache needs root; run_job joins these into one
        # shell command line.
        flush_cache_job = Job(
            cmd_line=['sudo',
                      'bash -c',
                      "'echo 1 > /proc/sys/vm/drop_caches'"])
    else:
        flush_cache_job = None
    sync_job = Job(cmd_line=['sync'])
    if sweep_config.read:
        run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)
    if sweep_config.write:
        run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)
if __name__ == "__main__":
main()
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
def pre_basic(args, tid, read_op):
    """Allocate the pinned CPU buffer and per-task context for a basic test.

    For reads the buffer is sized to the existing input file; for writes it is
    sized to args.write_size and targets a per-task output file.
    """
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(
        tid,
        f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
    )

    return {
        'file': file,
        'num_bytes': num_bytes,
        'buffer': buffer,
        'elapsed_sec': 0,
    }
def pre_basic_read(pool_params):
    """Pool adapter: unpack (args, tid) and build a read context."""
    args, tid = pool_params
    return pre_basic(args, tid, True)
def pre_basic_write(pool_params):
    """Pool adapter: unpack (args, tid) and build a write context."""
    args, tid = pool_params
    return pre_basic(args, tid, False)
def post_basic(pool_params):
    """Drop the task's I/O buffer from the context so its memory can be freed."""
    ctxt = pool_params[2]
    ctxt["buffer"].detach()
    ctxt["buffer"] = None
    return ctxt
def main_basic_read(pool_params):
    """Timed task: one blocking aio_read of the whole buffer from ctxt['file'].

    Accumulates wall time into ctxt['elapsed_sec'] and returns the context.
    """
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
                                     ctxt['file'],
                                     args.block_size,
                                     args.queue_depth,
                                     args.single_submit,
                                     args.overlap_events,
                                     args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time
    return ctxt
def main_basic_write(pool_params):
    """Timed task: one blocking aio_write of the whole buffer to ctxt['file'].

    Accumulates wall time into ctxt['elapsed_sec'] and returns the context.
    """
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
                                      ctxt['file'],
                                      args.block_size,
                                      args.queue_depth,
                                      args.single_submit,
                                      args.overlap_events,
                                      args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time
    return ctxt
def get_schedule(args, read_op):
    """Select the pre/post/main callables for a basic read or write benchmark."""
    if read_op:
        return {'pre': pre_basic_read, 'post': post_basic, 'main': main_basic_read}
    return {'pre': pre_basic_write, 'post': post_basic, 'main': main_basic_write}
def _aio_handle_tasklet(pool_params):
    """Worker-process body: run the pre/main/post schedule for one task id.

    All workers rendezvous on the shared barrier between phases so they are
    measured concurrently. Returns (main-task seconds, I/O-only seconds,
    total bytes transferred).
    """
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
    """Pool initializer: publish the shared barrier as a process-global."""
    global aio_barrier
    aio_barrier = b
def aio_basic_multiprocessing(args, read_op):
    """Fan the basic aio benchmark out over args.threads processes and report."""
    # Barrier keeps all workers in lock-step across benchmark phases.
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)
    report_results(args, read_op, pool_results)
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import torch
import os
import time
from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier
def pre_handle(args, tid, read_op):
    """Build the per-task context: I/O buffer (GPU or pinned CPU) plus an
    aio_handle configured from the benchmark arguments."""
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    file = args.read_file if read_op else f'{args.write_file}.{tid}'
    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    if args.gpu:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
    else:
        # Pinned CPU memory for fast transfers by the aio engine.
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(
        tid,
        f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
    )
    io_parallel = args.io_parallel if args.io_parallel else 1
    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
                                                args.queue_depth,
                                                args.single_submit,
                                                args.overlap_events,
                                                io_parallel)
    task_log(tid, f'created deepspeed aio handle')
    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['handle'] = handle
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0
    return ctxt
def pre_handle_read(pool_params):
    """Pool adapter: build a read context from (args, tid)."""
    args, tid = pool_params
    return pre_handle(args, tid, True)
def pre_handle_write(pool_params):
    """Pool adapter: build a write context from (args, tid)."""
    args, tid = pool_params
    return pre_handle(args, tid, False)
def post_handle(pool_params):
    """Release the task's I/O buffer after the benchmark completes."""
    ctxt = pool_params[2]
    ctxt["buffer"].detach()
    ctxt["buffer"] = None
    return ctxt
def main_parallel_read(pool_params):
    """Time one async pread of the whole buffer, including the wait for completion."""
    args, _, ctxt = pool_params
    handle = ctxt['handle']

    begin = time.time()
    assert handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True) != -1
    handle.wait()
    ctxt['elapsed_sec'] += time.time() - begin

    return ctxt
def main_parallel_write(pool_params):
    """Time one async pwrite of the whole buffer, including the wait for completion."""
    args, _, ctxt = pool_params
    handle = ctxt['handle']

    begin = time.time()
    assert handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True) != -1
    handle.wait()
    ctxt['elapsed_sec'] += time.time() - begin

    return ctxt
def main_handle_read(pool_parms):
    """Time one blocking handle.read of the whole buffer."""
    args, _, ctxt = pool_parms
    handle = ctxt['handle']

    begin = time.time()
    assert handle.read(ctxt['buffer'], ctxt['file'], args.validate) != -1
    ctxt['elapsed_sec'] += time.time() - begin

    return ctxt
def main_handle_write(pool_parms):
    """Time one blocking handle.write of the whole buffer."""
    args, _, ctxt = pool_parms
    handle = ctxt['handle']

    begin = time.time()
    assert handle.write(ctxt['buffer'], ctxt['file'], args.validate) != -1
    ctxt['elapsed_sec'] += time.time() - begin

    return ctxt
def get_schedule(args, read_op):
    """Select pre/post/main callables; the parallel (async) variants are used
    when --io_parallel is set."""
    if read_op:
        main = main_parallel_read if args.io_parallel else main_handle_read
        return {'pre': pre_handle_read, 'post': post_handle, 'main': main}

    main = main_parallel_write if args.io_parallel else main_handle_write
    return {'pre': pre_handle_write, 'post': post_handle, 'main': main}
def _aio_handle_tasklet(pool_params):
    """Worker-process body: run the pre/main/post schedule for one task id.

    All workers rendezvous on the shared barrier between phases so they are
    measured concurrently. Returns (main-task seconds, I/O-only seconds,
    total bytes transferred).
    """
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
    """Pool initializer: publish the shared barrier as a process-global."""
    global aio_barrier
    aio_barrier = b
def aio_handle_multiprocessing(args, read_op):
    """Fan the handle-based aio benchmark out over args.threads processes and report."""
    # Barrier keeps all workers in lock-step across benchmark phases.
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)
    report_results(args, read_op, pool_results)
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import argparse
import re
READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed'
PERF_METRICS = [READ_SPEED, WRITE_SPEED]
METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'}
def parse_arguments():
    """Parse --log_dir (required) and --metric (read_speed|write_speed)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir',
                        type=str,
                        required=True,
                        help='Folder of statistics logs')
    parser.add_argument('--metric',
                        type=str,
                        required=True,
                        help='Performance metric to report: [read_speed|write_speed]')
    args = parser.parse_args()
    print(f'args = {args}')
    return args
def extract_value(key, file):
    """Decode one underscore-delimited log-file-name field into a typed value.

    - Fields with a known non-numeric prefix ('ds') pass through unchanged.
    - 't4' / 'd8' / 'p2' style fields decode to ints.
    - 'bs128K' / 'bs1M' / 'bs512' style fields decode to a byte count.
    - Unparseable numeric fields are reported (with the source file) and
      yield None; all other fields pass through as strings.
    """
    INVALID_PREFIXES = ["ds"]
    for p in INVALID_PREFIXES:
        if key.startswith(p):
            return key
    try:
        if key[0] in ['t', 'd', 'p']:
            return int(key[1:])
        if key.startswith("bs"):
            if key.endswith('K'):
                return int(key[2:-1]) * 1024
            elif key.endswith('M'):
                return int(key[2:-1]) * 1024 * 1024
            else:
                return int(key[2:])
    except (ValueError, IndexError):
        # Narrowed from a bare `except:` — only parse failures (and empty
        # fields) are expected; anything else should propagate.
        print(f"{file}: extract_value fails on {key}")
        return None
    return key
def get_file_key(file):
    """Tuple of decoded fields from the file's base name (extension stripped)."""
    base, _ = os.path.splitext(os.path.basename(file))
    return tuple(extract_value(field, file) for field in base.split('_'))
def get_thread_count(file):
    """Thread count encoded as a 't<N>' field in the file name; 1 when absent."""
    base, _ = os.path.splitext(os.path.basename(file))
    for field in base.split('_'):
        if field[0] == 't':
            return int(field[1:])
    return 1
"""
Extract performance metric from log file.
Sample file lines are:
Task Read Latency = 0.031647682189941406 sec
Task Read Speed = 12.342926020792527 GB/sec
E2E Read Latency = 0.031697988510131836 sec
E2E Read Speed = 12.323337169333062 GB/sec
For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned
"""
def get_metric(file, metric):
    """Scan a log file for the metric's marker line and return its float value.

    Speed metrics come from the second-to-last whitespace field of lines like
    'E2E Read Speed = 12.32 GB/sec'; other metrics from the text after '='.
    Returns None when the marker never appears.
    """
    thread_count = get_thread_count(file)  # NOTE(review): unused; kept for parity
    with open(file) as f:
        for line in f:
            if not line.startswith(METRIC_SEARCH[metric]):
                continue
            if metric in [READ_SPEED, WRITE_SPEED]:
                return float(line.split()[-2])
            return float(line.split('=')[-1])
    return None
def validate_args(args):
    """Ensure the requested metric is supported and the log folder exists."""
    if args.metric not in PERF_METRICS:
        print(f'{args.metric} is not a valid performance metrics')
        return False

    if not os.path.isdir(args.log_dir):
        print(f'{args.log_dir} folder is not existent')
        return False

    return True
def get_results(log_files, metric):
    """Map each log file's decoded key tuple to its extracted metric value."""
    return {get_file_key(f): get_metric(f, metric) for f in log_files}
def get_sorted_results(log_dir, metric):
    """Collect the metric for every file in log_dir.

    Returns (sorted key tuples, {key tuple: metric value}).
    """
    log_files_path = [
        os.path.join(log_dir, entry) for entry in os.listdir(log_dir)
        if os.path.isfile(os.path.join(log_dir, entry))
    ]
    results = get_results(log_files_path, metric)
    return sorted(results.keys()), results
def main():
    """Entry point: print '(key) = value' for every log file, key-sorted."""
    print("Parsing aio statistics")
    args = parse_arguments()
    if not validate_args(args):
        quit()

    sorted_keys, results = get_sorted_results(args.log_dir, args.metric)
    for k in sorted_keys:
        print(f'{k} = {results[k]}')
if __name__ == "__main__":
main()
# Tags and folder names shared by the aio benchmark scripts.
SCRIPT_PREFIX = '_aio_bench'
WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read'
READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io'    # scratch dir for read test files
WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io'  # scratch dir for write test files
BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs'               # root log folder
READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs'
WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs'
This diff is collapsed.
This diff is collapsed.
{
"block_size": [
"128K",
"256K",
"1M"
],
"queue_depth": [
4,
16,
32
],
"io_parallel": [
1,
2,
4,
8
],
"single_submit": [
true,
false
],
"overlap_events": [
true,
false
],
"threads": [
1
]
}
This diff is collapsed.
This diff is collapsed.
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
# Fails with AssertionError when the async I/O extension cannot be built,
# e.g. when the libaio development package is not installed.
assert AsyncIOBuilder().is_compatible()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment