Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
deepspeed
Commits
67ea635f
Commit
67ea635f
authored
Mar 30, 2023
by
aiss
Browse files
push dsv0.8.2 version
parent
1b2721ad
Pipeline
#201
failed with stages
in 0 seconds
Changes
339
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2236 deletions
+0
-2236
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp
+0
-74
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h
+0
-57
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp
+0
-123
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h
+0
-77
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp
+0
-84
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h
+0
-57
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp
+0
-121
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h
+0
-27
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+0
-282
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+0
-68
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp
+0
-133
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h
+0
-42
deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp
deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp
+0
-41
deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py
deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py
+0
-96
deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py
deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py
+0
-397
deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py
deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py
+0
-144
deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py
deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py
+0
-176
deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py
deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py
+0
-154
deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py
deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py
+0
-8
deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh
deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh
+0
-75
No files found.
Too many changes to show.
To preserve performance only
339 of 339+
files are displayed.
Plain diff
Email patch
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using
namespace
std
;
const
int
c_block_size
=
128
*
1024
;
const
int
c_io_queue_depth
=
8
;
// Default configuration: library-default block size and queue depth,
// with single-submit, event overlap, and memory locking all disabled.
deepspeed_aio_config_t::deepspeed_aio_config_t()
    : _block_size(c_block_size),
      _queue_depth(c_io_queue_depth),
      _single_submit(false),
      _overlap_events(false),
      _lock_memory(false)
{
}

// Fully-specified configuration; every knob is taken verbatim from the caller.
deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size,
                                               const int queue_depth,
                                               const bool single_submit,
                                               const bool overlap_events,
                                               const bool lock_memory)
    : _block_size(block_size),
      _queue_depth(queue_depth),
      _single_submit(single_submit),
      _overlap_events(overlap_events),
      _lock_memory(lock_memory)
{
}
// Print "<tag><min> <max> <avg> " followed by a newline to stdout.
void deepspeed_aio_latency_t::dump(const std::string tag)
{
    std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl;
}

// Element-wise accumulation of another latency record into this one.
void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& rhs)
{
    _min_usec += rhs._min_usec;
    _max_usec += rhs._max_usec;
    _avg_usec += rhs._avg_usec;
}

// Element-wise scaling of all three statistics by `factor`.
void deepspeed_aio_latency_t::scale(const float factor)
{
    _min_usec *= factor;
    _max_usec *= factor;
    _avg_usec *= factor;
}
// Build a libaio context: pre-allocate `queue_depth` zeroed iocb slots,
// size the completion-event array, and initialize the kernel AIO queue.
// NOTE(review): the io_queue_init() return value is not checked — TODO confirm
// callers can tolerate a silently uninitialized context.
aio_context::aio_context(const int block_size, const int queue_depth)
{
    _block_size = block_size;
    _queue_depth = queue_depth;
    for (auto slot = 0; slot < queue_depth; ++slot) {
        _iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb)));
    }
    _io_events.resize(queue_depth);
    io_queue_init(queue_depth, &_io_ctxt);
}

// Release every pre-allocated iocb, drop the event array, and tear down
// the kernel AIO queue.
aio_context::~aio_context()
{
    for (auto& cb : _iocbs) { free(cb); }
    _io_events.resize(0);
    io_queue_release(_io_ctxt);
}
deepspeed/ops/csrc/aio/common/deepspeed_aio_types.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <libaio.h>
#include <stdlib.h>
#include <string>
#include <vector>
using
namespace
std
;
// Min/max/average latency statistics (microseconds) for one phase of an AIO run.
struct deepspeed_aio_latency_t {
    double _min_usec;  // smallest observed latency
    double _max_usec;  // largest observed latency
    double _avg_usec;  // mean latency

    // Print "<tag><min> <max> <avg> " to stdout.
    void dump(const std::string tag);
    // Element-wise += of another record into this one.
    void accumulate(const deepspeed_aio_latency_t&);
    // Element-wise *= of all three statistics.
    void scale(const float value);
};
// Aggregate performance record for one AIO operation: per-phase latency
// plus end-to-end time and throughput.
struct deepspeed_aio_perf_t {
    deepspeed_aio_latency_t _submit;    // io_submit phase latency stats
    deepspeed_aio_latency_t _complete;  // completion/getevents phase latency stats
    double _e2e_usec;                   // total wall time, microseconds
    double _e2e_rate_GB;                // effective throughput, GB/s
};
struct
deepspeed_aio_config_t
{
const
int
_block_size
;
const
int
_queue_depth
;
const
bool
_single_submit
;
const
bool
_overlap_events
;
const
bool
_lock_memory
;
deepspeed_aio_config_t
();
deepspeed_aio_config_t
(
const
int
block_size
,
const
int
queue_depth
,
const
bool
single_submit
,
const
bool
overlap_events
,
const
bool
lock_memory
);
};
// Owns the kernel libaio queue plus pre-allocated request/completion arrays.
// Constructor calloc()s one iocb per queue slot; destructor frees them and
// releases the kernel queue.
struct aio_context {
    io_context_t _io_ctxt;                   // kernel AIO queue handle
    std::vector<struct io_event> _io_events; // completion events, sized to _queue_depth
    std::vector<struct iocb*> _iocbs;        // owned, heap-allocated request slots
    int _block_size;                         // bytes per request
    int _queue_depth;                        // number of slots

    aio_context(const int block_size, const int queue_depth);
    ~aio_context();
};
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using
namespace
std
;
const
int
c_block_size
=
128
*
1024
;
const
int
c_io_queue_depth
=
8
;
// Describe one file<->memory transfer: the open descriptor, the starting
// file offset, the host buffer, and the total byte count.
io_xfer_ctxt::io_xfer_ctxt(const int fd,
                           const long long int file_offset,
                           const long long int num_bytes,
                           const void* buffer)
    : _fd(fd),
      _base_offset(file_offset),
      _mem_buffer(buffer),
      _num_bytes(num_bytes)
{
}
// Bind a transfer description to a caller-owned iocb array so blocks of the
// transfer can be prepared for submission.
io_prep_context::io_prep_context(const bool read_op,
                                 const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                                 const size_t block_size,
                                 const std::vector<struct iocb*>* iocbs)
    : _read_op(read_op),
      _xfer_ctxt(xfer_ctxt),
      _block_size(block_size),
      _iocbs(iocbs)
{
}

// Fill the first `n_iocbs` slots with requests covering `num_bytes` starting
// at `start_buffer`/`start_offset`. The final block is truncated so the total
// never exceeds `num_bytes`.
// NOTE(review): the buffer address is advanced by _base_offset as well as the
// block shift, mirroring the file offset — presumably caller passes buffers
// laid out to match; confirm against do_aio_operation_* callers.
void io_prep_context::prep_iocbs(const int n_iocbs,
                                 const size_t num_bytes,
                                 const void* start_buffer,
                                 const long long int start_offset)
{
    assert(static_cast<size_t>(n_iocbs) <= _iocbs->size());

    for (auto slot = 0; slot < n_iocbs; ++slot) {
        const auto shift = slot * _block_size;
        const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift;
        const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift;

        // Truncate the trailing partial block.
        auto byte_count = _block_size;
        if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; }

        if (_read_op) {
            io_prep_pread(_iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
        } else {
            io_prep_pwrite(_iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
        }
    }
}
// Incremental request generator: tracks how many bytes/blocks of the transfer
// remain and prepares them batch by batch.
io_prep_generator::io_prep_generator(const bool read_op,
                                     const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                                     const size_t block_size)
    : _read_op(read_op),
      _xfer_ctxt(xfer_ctxt),
      _block_size(block_size),
      _remaining_bytes(xfer_ctxt->_num_bytes),
      _next_iocb_index(0)
{
    // Round up so a trailing partial block still gets its own request.
    _num_io_blocks =
        static_cast<long long int>(ceil(static_cast<double>(xfer_ctxt->_num_bytes) / block_size));
    _remaining_io_blocks = _num_io_blocks;
}

// Prepare up to `n_iocbs` requests into `iocbs`, consuming the remaining
// blocks in order. Returns the number of requests actually prepared
// (0 when the transfer is exhausted).
int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs)
{
    if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) {
        // NOTE(review): this compares a byte count against a block count;
        // both are only guaranteed to be 0 together — presumably a sanity
        // check that they hit zero simultaneously. Confirm intent.
        assert(static_cast<long long int>(_remaining_bytes) == _remaining_io_blocks);
        return 0;
    }

    assert(static_cast<size_t>(n_iocbs) <= iocbs->size());

    auto actual_n_iocbs = min(static_cast<long long int>(n_iocbs), _remaining_io_blocks);
    for (auto slot = 0; slot < actual_n_iocbs; ++slot, ++_next_iocb_index) {
        const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size);
        const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset;
        // The last block may be short.
        const auto num_bytes = min(static_cast<long long int>(_block_size), _remaining_bytes);

        if (_read_op) {
            io_prep_pread(iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
        } else {
            io_prep_pwrite(iocbs->at(slot), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
        }
        _remaining_bytes -= num_bytes;
    }
    _remaining_io_blocks -= actual_n_iocbs;

    return actual_n_iocbs;
}
// Fetch the size of `filename` in bytes via stat(2).
// On success returns 0 and stores the size in `size`; on failure returns -1
// (errno is left set by stat for the caller to report).
int get_file_size(const char* filename, long long int& size)
{
    struct stat sbuf;
    if (stat(filename, &sbuf) == -1) { return -1; }
    size = sbuf.st_size;
    return 0;
}
// Allocate `size` bytes of page-aligned host memory; if `lock` is set, pin
// the pages with mlock(2) so they cannot be swapped out (required for stable
// O_DIRECT/AIO buffers).
// Returns the buffer, or nullptr if allocation or pinning fails. On a pinning
// failure the buffer is freed before returning.
// Fix: the mlock failure diagnostic previously went to stdout via printf;
// error output belongs on stderr so it is not interleaved with timing output.
void* ds_page_aligned_alloc(const size_t size, const bool lock)
{
    void* ptr = nullptr;
    const int retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size);
    if (retval) { return nullptr; }

    if (lock == false) { return ptr; }

    const auto mlock_ret = mlock(ptr, size);
    if (mlock_ret != 0) {
        const auto mlock_error = errno;
        fprintf(stderr, "mlock failed with %d %s\n", mlock_error, strerror(mlock_error));
        free(ptr);
        return nullptr;
    }

    return ptr;
}
deepspeed/ops/csrc/aio/common/deepspeed_aio_utils.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <libaio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <deepspeed_aio_types.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
struct
io_xfer_ctxt
{
const
int
_fd
;
const
long
long
int
_base_offset
;
const
void
*
_mem_buffer
;
const
long
long
int
_num_bytes
;
io_xfer_ctxt
(
const
int
fd
,
const
long
long
int
file_offset
,
const
long
long
int
num_bytes
,
const
void
*
buffer
);
};
// Prepares iocb requests for a transfer using a caller-owned iocb array.
// Holds only references/pointers; the transfer context and iocb vector must
// outlive this object.
struct io_prep_context {
    const bool _read_op;                              // true = read, false = write
    const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;  // transfer being prepared
    const size_t _block_size;                         // bytes per request
    const std::vector<struct iocb*>* _iocbs;          // caller-owned request slots

    io_prep_context(const bool read_op,
                    const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                    const size_t block_size,
                    const std::vector<struct iocb*>* iocbs);

    // Fill the first n_iocbs slots covering num_bytes from
    // start_buffer/start_offset; the final block is truncated.
    void prep_iocbs(const int n_iocbs,
                    const size_t num_bytes,
                    const void* start_buffer,
                    const long long int start_offset);
};
// Incremental iocb generator: repeatedly call prep_iocbs() to consume a
// transfer block by block. Tracks remaining bytes/blocks and the next block
// index between calls.
struct io_prep_generator {
    const bool _read_op;                              // true = read, false = write
    const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;  // transfer being consumed
    const size_t _block_size;                         // bytes per request

    long long int _remaining_bytes;      // bytes not yet prepared
    long long int _num_io_blocks;        // total blocks (rounded up)
    long long int _remaining_io_blocks;  // blocks not yet prepared
    long long int _next_iocb_index;      // index of next block to prepare

    io_prep_generator(const bool read_op,
                      const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                      const size_t block_size);

    // Prepare up to n_iocbs requests; returns the number prepared (0 = done).
    int prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs);
};
// Allocate page-aligned host memory; optionally pin it with mlock(2).
// Returns nullptr on allocation or pinning failure.
void* ds_page_aligned_alloc(const size_t size, const bool lock = false);

// Store the byte size of `filename` into `size` via stat(2).
// Returns 0 on success, -1 on failure (errno set).
int get_file_size(const char* filename, long long int& size);
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_aio_thread.h"
using
namespace
std
;
// Describe one parallel I/O operation. For CUDA tensors the data is staged
// through a pinned host copy so the AIO threads can transfer host memory.
io_op_desc_t::io_op_desc_t(const bool read_op,
                           const torch::Tensor& buffer,
                           const int fd,
                           const char* filename,
                           const long long int num_bytes,
                           const bool validate)
    : _read_op(read_op),
      _buffer(buffer),
      _fd(fd),
      _filename(filename),
      _num_bytes(num_bytes),
      _validate(validate)
{
    // CUDA tensors cannot be targeted by file I/O directly: stage via a
    // pinned CPU copy. CPU tensors are used as-is.
    _cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer;
    // AIO requires a contiguous region.
    _contiguous_buffer = _cpu_buffer.contiguous();
}

// Raw pointer to the contiguous host staging buffer.
char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }

// Finish the op: for reads into CUDA tensors, copy the staged host data back
// to the device. Writes and CPU reads need no post-processing.
void io_op_desc_t::fini()
{
    if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
}
// Worker-thread state: each thread owns a private aio_context sized from the
// shared config and starts with the exit flag cleared.
deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
    : _tid(tid),
      _aio_config(aio_config),
      _aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)),
      _time_to_exit(false)
{
}

deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {}

// Worker loop: block until work arrives or shutdown is requested, execute the
// op against this thread's slice of the file, then publish completion.
void deepspeed_aio_thread_t::run()
{
    while (true) {
        std::shared_ptr<struct io_op_desc_t> next_io_op = nullptr;

        {
            // Wait under the work mutex for either a queued op or the exit flag.
            std::unique_lock<std::mutex> lock(_work_sync._mutex);
            _work_sync._cond_var.wait(lock,
                                      [this] { return (!_work_queue.empty() || _time_to_exit); });
            if (!_work_queue.empty()) {
                next_io_op = _work_queue.front();
                _work_queue.pop();
            }
        }

        if (next_io_op) {
            // Each thread handles a disjoint slice: thread t covers bytes
            // [t * num_bytes, (t+1) * num_bytes) of the file.
            const auto base_offset = next_io_op->_num_bytes * _tid;

            std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(
                next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr()));

            if (_aio_config._overlap_events) {
                do_aio_operation_overlap(
                    next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
            } else {
                do_aio_operation_sequential(
                    next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
            }

            {
                // Publish completion, then notify outside the lock.
                std::lock_guard<std::mutex> lock(_complete_sync._mutex);
                _complete_queue.push(next_io_op);
            }
            _complete_sync._cond_var.notify_one();
        }

        if (_time_to_exit) { break; }
    }
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_aio_thread.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include <queue>
#include "deepspeed_py_aio.h"
// One scheduled parallel I/O operation, shared by all worker threads.
// _num_bytes is the PER-THREAD byte count (total bytes / thread count).
struct io_op_desc_t {
    const bool _read_op;                // true = read, false = write
    torch::Tensor _buffer;              // caller's tensor (may be CUDA)
    int _fd;                            // open file descriptor
    const std::string _filename;        // file path (kept for validation/reporting)
    const long long int _num_bytes;     // bytes handled by each thread
    torch::Tensor _cpu_buffer;          // pinned host staging copy for CUDA tensors
    torch::Tensor _contiguous_buffer;   // contiguous view actually used for I/O
    const bool _validate;               // re-read/verify after the op completes

    io_op_desc_t(const bool read_op,
                 const torch::Tensor& buffer,
                 const int fd,
                 const char* filename,
                 const long long int num_bytes,
                 const bool validate);

    // Raw pointer to the contiguous host staging buffer.
    char* data_ptr() const;
    // Post-completion work (copy staged read data back to CUDA device).
    void fini();
};
// Mutex + condition-variable pair guarding one work/completion queue.
// NOTE(review): std::mutex is used but this header does not include <mutex>
// directly — presumably pulled in transitively; confirm and add the include.
struct thread_sync_t {
    std::mutex _mutex;
    std::condition_variable _cond_var;
};
// Per-worker-thread context: private aio_context plus work/completion queues
// with their synchronization. run() is the thread entry loop.
struct deepspeed_aio_thread_t {
    const int _tid;                      // thread index; selects the file slice
    deepspeed_aio_config_t& _aio_config; // shared config (owned by the handle)

    std::unique_ptr<struct aio_context> _aio_ctxt;  // this thread's libaio queue

    std::queue<std::shared_ptr<struct io_op_desc_t>> _work_queue;      // pending ops
    std::queue<std::shared_ptr<struct io_op_desc_t>> _complete_queue;  // finished ops

    bool _time_to_exit;  // set (under _work_sync._mutex) to stop run()

    struct thread_sync_t _work_sync;      // guards _work_queue
    struct thread_sync_t _complete_sync;  // guards _complete_queue

    deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config);
    ~deepspeed_aio_thread_t();

    // Thread entry loop: consume work until _time_to_exit is observed.
    void run();
};
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cassert>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "deepspeed_py_aio.h"
using
namespace
std
;
using
namespace
std
::
chrono
;
// Debug tracing toggles (0 = off).
#define DEBUG_DS_AIO_READ 0
#define DEBUG_DS_AIO_WRITE 0

// Library name used in diagnostics.
static const std::string c_library_name = "deepspeed_aio";
// Single-shot synchronous write of `buffer` to `filename` using libaio.
// Builds a one-off config and aio_context, runs the transfer (overlapped or
// sequential per config), optionally validates by re-reading, and prints
// timing. Returns 0 on success, -1 if the file cannot be opened.
int deepspeed_py_aio_write(const torch::Tensor& buffer,
                           const char* filename,
                           const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const bool validate)
{
    const auto start_time = std::chrono::high_resolution_clock::now();

    deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);

    const auto fd = open_file(filename, false);
    if (fd == -1) { return -1; }

    auto write_buffer = (char*)buffer.data_ptr();
    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());

    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
    std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));

    if (config._overlap_events) {
        do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr);
    } else {
        do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr);
    }
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    close(fd);

    if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
// Single-shot synchronous read of `filename` into `buffer` using libaio.
// The tensor must already be sized to the file (checked with assert only —
// release builds will proceed on mismatch). Returns 0 on success, -1 when
// the file cannot be stat'ed or opened.
int deepspeed_py_aio_read(torch::Tensor& buffer,
                          const char* filename,
                          const int block_size,
                          const int queue_depth,
                          const bool single_submit,
                          const bool overlap_events,
                          const bool validate)
{
    const auto start_time = std::chrono::high_resolution_clock::now();

    long long num_file_bytes;
    if (-1 == get_file_size(filename, num_file_bytes)) {
        const auto error_code = errno;
        report_file_error(filename, " fstat for read", error_code);
        return -1;
    }

    deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);

    const auto fd = open_file(filename, true);
    if (fd == -1) { return -1; }

    auto read_buffer = (char*)buffer.data_ptr();
    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);

    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
    std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));

    if (config._overlap_events) {
        do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr);
    } else {
        do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr);
    }
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    close(fd);

    if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
// Synchronous single-shot AIO write of `buffer` to `filename`.
// Returns 0 on success, -1 on open failure.
int deepspeed_py_aio_write(const torch::Tensor& buffer,
                           const char* filename,
                           const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const bool validate);

// Synchronous single-shot AIO read of `filename` into `buffer`.
// Buffer must match the file size. Returns 0 on success, -1 on stat/open failure.
int deepspeed_py_aio_read(torch::Tensor& buffer,
                          const char* filename,
                          const int block_size,
                          const int queue_depth,
                          const bool single_submit,
                          const bool overlap_events,
                          const bool validate);
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_aio_handle.h"
using
namespace
std
;
// std::thread entry point: hold a shared_ptr to the thread context (keeping
// it alive for the thread's lifetime) and run its work loop.
static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt)
{
    ctxt->run();
}
// Build the handle: one shared config, one context per worker, then launch
// the worker threads. Each worker gets its own aio_context sized from the
// same block size / queue depth.
deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
                                               const int queue_depth,
                                               const bool single_submit,
                                               const bool overlap_events,
                                               const int num_threads)
    : _aio_ctxt(new aio_context(block_size, queue_depth)),
      _single_submit(single_submit),
      _overlap_events(overlap_events),
      _num_threads(num_threads),
      _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
      _num_pending_ops(0)
{
    for (auto tid = 0; tid < num_threads; ++tid) {
        _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(tid, _aio_config));
    }

    for (auto& ctxt : _thread_contexts) {
        _threads.push_back(std::thread(_start_aio_thread, ctxt));
    }
}

// Signal all workers to exit and join them. Requires no pending ops
// (asserted inside _stop_threads).
deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
{
    _stop_threads();
    for (auto& thr : _threads) { thr.join(); }
}
// Accessors for the handle's configuration. Block size and queue depth come
// from the handle's own aio_context and report -1 if it is absent.

const int deepspeed_aio_handle_t::get_block_size() const
{
    return _aio_ctxt ? _aio_ctxt->_block_size : -1;
}

const int deepspeed_aio_handle_t::get_queue_depth() const
{
    return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
}

const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; }

const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }

const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
// Single-threaded synchronous read of the whole file into `buffer`, executed
// on the caller's thread with the handle's own aio_context (workers are not
// involved). Buffer size must equal file size (assert only). Returns 0 on
// success, -1 on stat/open failure.
int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
{
    const auto start_time = std::chrono::high_resolution_clock::now();

    assert(_aio_ctxt);

    long long num_file_bytes;
    if (-1 == get_file_size(filename, num_file_bytes)) {
        const auto error_code = errno;
        report_file_error(filename, " fstat for read", error_code);
        return -1;
    }
    assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);

    const auto fd = open_file(filename, true);
    if (fd == -1) { return -1; }

    auto read_buffer = (char*)buffer.data_ptr();
    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));

    if (_aio_config._overlap_events) {
        do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    } else {
        do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    }

    close(fd);
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
// Single-threaded synchronous write of `buffer` to `filename`, executed on
// the caller's thread with the handle's own aio_context. Returns 0 on
// success, -1 on open failure.
int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
                                  const char* filename,
                                  const bool validate)
{
    assert(_aio_ctxt);

    const auto start_time = std::chrono::high_resolution_clock::now();

    const auto fd = open_file(filename, false);
    if (fd == -1) { return -1; }

    auto write_buffer = (char*)buffer.data_ptr();
    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
    std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));

    if (_aio_config._overlap_events) {
        do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    } else {
        do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
    }
    const std::chrono::duration<double> aio_time =
        std::chrono::high_resolution_clock::now() - start_time;

    close(fd);

    if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }

    const std::chrono::duration<double> fn_time =
        std::chrono::high_resolution_clock::now() - start_time;
    std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
              << " call = " << fn_time.count() * 1e6 << std::endl;
    return 0;
}
// Hand the same op descriptor to EVERY worker thread (each processes its own
// slice of the file), then count one pending op.
void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
{
    for (auto& ctxt : _thread_contexts) {
        {
            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
            ctxt->_work_queue.push(scheduled_op);
        }
        ctxt->_work_sync._cond_var.notify_one();
    }
    _num_pending_ops++;
}
// Collect one completion from every worker thread. Since _schedule_aio_work
// pushes the same descriptor to all threads, popping one entry per thread
// retires exactly one logical op; the shared descriptor is returned.
std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_wait_for_aio_work()
{
    std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
    for (auto& ctxt : _thread_contexts) {
        // Block until this thread has published a completion, then pop it.
        std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
        ctxt->_complete_sync._cond_var.wait(lock,
                                            [ctxt] { return !ctxt->_complete_queue.empty(); });
        completed_op = ctxt->_complete_queue.front();
        ctxt->_complete_queue.pop();
    }
    return completed_op;
}
// Ask every worker to exit its run() loop. Must only be called once all
// scheduled ops have been retired (asserted).
void deepspeed_aio_handle_t::_stop_threads()
{
    assert(0 == _num_pending_ops);
    for (auto& ctxt : _thread_contexts) {
        {
            // The exit flag is read under _work_sync._mutex in run().
            std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
            ctxt->_time_to_exit = true;
        }
        ctxt->_work_sync._cond_var.notify_one();
    }
}
// Block until all pending ops complete. For each op: run its finalizer (e.g.
// CUDA copy-back), close its file descriptor, and optionally validate the
// whole transfer. Returns the number of ops retired. Requires at least one
// pending op (asserted).
int deepspeed_aio_handle_t::wait()
{
    assert(_num_pending_ops > 0);
    auto num_completed_ops = 0;

    while (_num_pending_ops > 0) {
        auto completed_op = _wait_for_aio_work();

        completed_op->fini();

        close(completed_op->_fd);

        if (completed_op->_validate) {
            // _num_bytes is per-thread; the full transfer is threads * bytes.
            validate_aio_operation(completed_op->_read_op,
                                   completed_op->_filename.c_str(),
                                   completed_op->data_ptr(),
                                   _num_threads * completed_op->_num_bytes);
        }
        --_num_pending_ops;
        ++num_completed_ops;
    }

    return num_completed_ops;
}
// A parallel op is valid only when the byte count divides evenly across the
// worker threads (each thread handles an equal slice). Logs and returns
// false otherwise.
bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
                                                       const long long int num_bytes)
{
    const auto op_string = read_op ? "Read" : "Write";
    if (num_bytes % get_thread_count()) {
        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
                  << " not divisible by thread count = " << get_thread_count() << std::endl;
        return false;
    }

    return true;
}
// Parallel read of `filename` into `buffer`, sliced evenly across the worker
// threads. If `async`, returns immediately after scheduling (pair with
// wait()); otherwise blocks until completion.
// Returns 0 (async) or wait()'s completed-op count on success, -1 on error.
//
// Fixes:
//  - A buffer/file size mismatch was only printed and then assert()ed, so
//    release builds (NDEBUG) continued into a mismatched transfer. Now the
//    mismatch returns -1 after the diagnostic.
//  - The redundant divisibility assert aborted debug builds before the
//    graceful _is_valid_parallel_aio_op() check could report and return -1;
//    the graceful check alone now handles non-divisible sizes.
int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
                                  const char* filename,
                                  const bool validate,
                                  const bool async)
{
    long long num_file_bytes;
    if (-1 == get_file_size(filename, num_file_bytes)) {
        const auto error_code = errno;
        report_file_error(filename, " fstat for read", error_code);
        return -1;
    }

    const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
    if (buffer_bytes != num_file_bytes) {
        std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
                  << " != " << num_file_bytes << std::endl;
        return -1;
    }

    if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }

    const auto fd = open_file(filename, true);
    if (fd == -1) { return -1; }

    // Each thread reads num_file_bytes / _num_threads from its own offset.
    auto scheduled_op = std::make_shared<io_op_desc_t>(
        true, buffer, fd, filename, (num_file_bytes / _num_threads), validate);

    _schedule_aio_work(scheduled_op);

    if (async) { return 0; }

    return wait();
}
// Parallel write of `buffer` to `filename`, sliced evenly across the worker
// threads. If `async`, returns immediately after scheduling (pair with
// wait()); otherwise blocks until completion.
// Returns 0 (async) or wait()'s completed-op count on success, -1 on error.
//
// Fix: the redundant divisibility assert aborted debug builds before the
// graceful _is_valid_parallel_aio_op() check could report and return -1;
// the graceful check alone now handles non-divisible sizes.
int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
                                   const char* filename,
                                   const bool validate,
                                   const bool async)
{
    const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
    if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }

    const auto fd = open_file(filename, false);
    if (fd == -1) { return -1; }

    // Each thread writes num_write_bytes / _num_threads at its own offset.
    auto scheduled_op = std::make_shared<io_op_desc_t>(
        false, buffer, fd, filename, (num_write_bytes / _num_threads), validate);

    _schedule_aio_work(scheduled_op);

    if (async) { return 0; }

    return wait();
}
// Convenience wrappers over pread/pwrite: "sync" variants block without
// validation; "async" variants schedule and return (pair with wait()).

int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
{
    return pread(buffer, filename, false, false);
}

int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
{
    return pwrite(buffer, filename, false, false);
}

int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
{
    return pread(buffer, filename, false, true);
}

int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
{
    return pwrite(buffer, filename, false, true);
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_aio_handle.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include "deepspeed_aio_thread.h"
// Python-facing AIO handle: owns a pool of worker threads that execute
// parallel reads/writes, plus a private aio_context for the single-threaded
// read()/write() paths.
struct deepspeed_aio_handle_t {
    std::unique_ptr<struct aio_context> _aio_ctxt;  // used by read()/write() only
    const bool _single_submit;                      // submit requests one at a time
    const bool _overlap_events;                     // overlap submit with completion
    const int _num_threads;                         // worker pool size
    deepspeed_aio_config_t _aio_config;             // shared by all workers

    std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
    std::vector<std::thread> _threads;   // one per context, joined in dtor
    int _num_pending_ops;                // ops scheduled but not yet retired by wait()

    deepspeed_aio_handle_t(const int block_size,
                           const int queue_depth,
                           const bool single_submit,
                           const bool overlap_events,
                           const int num_threads);

    ~deepspeed_aio_handle_t();

    // Configuration accessors.
    const int get_block_size() const;
    const int get_queue_depth() const;
    const bool get_single_submit() const;
    const bool get_overlap_events() const;
    const int get_thread_count() const;

    // Single-threaded, synchronous whole-file transfers on the caller's thread.
    int read(torch::Tensor& buffer, const char* filename, const bool validate);
    int write(const torch::Tensor& buffer, const char* filename, const bool validate);

    // Parallel transfers across the worker pool; async variants require wait().
    int pread(const torch::Tensor& buffer,
              const char* filename,
              const bool validate,
              const bool async);
    int pwrite(const torch::Tensor& buffer,
               const char* filename,
               const bool validate,
               const bool async);

    // Convenience wrappers (no validation).
    int sync_pread(torch::Tensor& buffer, const char* filename);
    int sync_pwrite(const torch::Tensor& buffer, const char* filename);
    int async_pread(torch::Tensor& buffer, const char* filename);
    int async_pwrite(const torch::Tensor& buffer, const char* filename);

    // Internals.
    int wait();
    void _stop_threads();
    void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);
    std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
    bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
};
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_copy.h"
#include <omp.h>
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
#if defined(__AVX512__) or defined(__AVX256__)
union
AVX_Data
{
#if defined(__AVX512__)
__m512
data
;
#else
__m256
data
;
#endif
};
#endif
// Copy `param_size` floats from src to dest, one SIMD register per step when
// AVX is enabled, with an OpenMP-parallel scalar loop handling the remainder
// (or everything, in non-AVX builds).
static void helper_memcpy_1(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH);

    // Process TILE-sized chunks; parallelize SIMD copies within each chunk.
    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH) {
            AVX_Data src_4;
            src_4.data = SIMD_LOAD(src + i);
            SIMD_STORE(dest + i, src_4.data);
        }
    }
#endif

    // Scalar tail (or the whole copy when SIMD is unavailable).
    if (param_size > rounded_size) {
#pragma omp parallel for
        for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; }
    }
}
// Copy with a 4-register unroll: rounds down to a multiple of 4*SIMD_WIDTH
// and moves four SIMD registers per inner-loop iteration. The tail (less than
// one unrolled group) is delegated to the 1-register helper_memcpy_1.
static void helper_memcpy_4(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    // Rounding granularity (SIMD_WIDTH << 2) matches the loop stride below.
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2));

    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
            AVX_Data src_4[4];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
        }
    }
#endif

    // Remainder handled by the narrower copy helper.
    if (param_size > rounded_size)
        helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
// Copy with an 8-register unroll; the sub-unroll tail goes to helper_memcpy_4.
// (Name keeps the historical "mempcy" typo because the caller below uses it.)
static void helper_mempcy_8(float* dest, float* src, size_t param_size)
{
    size_t rounded_size = 0;

#if defined(__AVX512__) or defined(__AVX256__)
    // FIX: round down to the 8-register stride (SIMD_WIDTH << 3). The previous
    // code rounded to (SIMD_WIDTH << 2) while the inner loop advances by
    // (SIMD_WIDTH << 3), so a rounded_size that was a multiple of 4*SIMD_WIDTH
    // but not 8*SIMD_WIDTH let the final iteration load/store up to
    // 4*SIMD_WIDTH floats past rounded_size — potentially past param_size.
    rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 3));

    for (size_t t = 0; t < rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
        size_t offset = copy_size + t;
#pragma omp parallel for
        for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
            AVX_Data src_4[8];
            src_4[0].data = SIMD_LOAD(src + i);
            src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH);
            src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1));
            src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3);
            src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2));
            src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5);
            src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6);
            src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7);

            SIMD_STORE(dest + i, src_4[0].data);
            SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data);
            SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data);
            SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data);
        }
    }
#endif

    // Tail smaller than one 8-wide group: reuse the 4-wide (then scalar) path.
    if (param_size > rounded_size)
        helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size));
}
// Copy the contents of `src` into `dest` using the SIMD/OpenMP helpers.
// Both tensors are treated as flat float buffers. Returns 0 on success.
int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src)
{
    // NOTE(review): contiguous() returns a *copy* when the tensor is not
    // already contiguous, so a non-contiguous `dest` would not observe the
    // copy. Callers appear to pass flat contiguous swap buffers — confirm.
    auto dest_c = dest.contiguous();
    auto src_c = src.contiguous();

    float* dest_ptr = (float*)dest_c.data_ptr();
    float* src_ptr = (float*)src_c.data_ptr();

    // FIX: copy every element. size(0) only counts the first dimension and
    // silently truncated multi-dimensional tensors; numel() is identical for
    // the 1-D case callers were using.
    helper_mempcy_8(dest_ptr, src_ptr, dest_c.numel());

    return 0;
}
deepspeed/ops/csrc/aio/py_lib/deepspeed_py_copy.h
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
#define TILE (1024 * 1024 * 1024)
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif
int
deepspeed_py_memcpy
(
torch
::
Tensor
&
dest
,
const
torch
::
Tensor
&
src
);
deepspeed/ops/csrc/aio/py_lib/py_ds_aio.cpp
deleted
100644 → 0
View file @
1b2721ad
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"
// Python bindings for the DeepSpeed async-I/O extension: two one-shot module
// functions, the tensor memcpy helper, and the stateful aio_handle class.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");

    m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");

    m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");

    // aio_handle(block_size, queue_depth, single_submit, overlap_events,
    //            num_threads) — owns the libaio context and worker threads.
    py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
        .def(py::init<const int, const int, const bool, const bool, const int>())
        // Configuration accessors.
        .def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
        .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
        .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit)
        .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
        .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
        // Single-threaded blocking I/O.
        .def("read", &deepspeed_aio_handle_t::read)
        .def("write", &deepspeed_aio_handle_t::write)
        // Parallel I/O; the trailing bool selects async submission.
        .def("pread", &deepspeed_aio_handle_t::pread)
        .def("pwrite", &deepspeed_aio_handle_t::pwrite)
        // Convenience wrappers around pread/pwrite.
        .def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
        .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
        .def("async_pread", &deepspeed_aio_handle_t::async_pread)
        .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
        // Block until all outstanding async operations complete.
        .def("wait", &deepspeed_aio_handle_t::wait);
}
deepspeed/ops/csrc/aio/py_test/aio_bench_generate_param.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
import
argparse
import
json
from
parse_aio_stats
import
READ_SPEED
,
WRITE_SPEED
,
get_sorted_results
from
perf_sweep_utils
import
BENCH_LOG_DIR
,
READ_LOG_DIR
,
WRITE_LOG_DIR
def parse_arguments():
    """Parse the CLI: --log_dir locates the sweep logs to analyze."""
    parser = argparse.ArgumentParser()
    log_dir_help = f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}'
    parser.add_argument('--log_dir', type=str, default=BENCH_LOG_DIR, help=log_dir_help)
    args = parser.parse_args()
    print(f'args = {args}')
    return args
def validate_args(args):
    """Require both the read and write log sub-folders under args.log_dir."""
    for sub_dir in (READ_LOG_DIR, WRITE_LOG_DIR):
        log_dir = os.path.join(args.log_dir, sub_dir)
        if os.path.isdir(log_dir):
            continue
        print(f'{log_dir} folder is not existent')
        return False
    return True
def convert_to_param(key):
    """Translate a six-field log key into the 'aio' JSON config dict.

    Field order: (submit mode, overlap mode, <unused>, threads, depth, block).
    """
    assert len(key) == 6
    submit_mode, overlap_mode, _, threads, depth, block = key
    return {
        "single_submit": "true" if submit_mode == "single" else "false",
        "overlap_events": "true" if overlap_mode == "overlap" else "false",
        "thread_count": int(threads),
        "queue_depth": int(depth),
        "block_size": int(block),
    }
def generate_aio_param(read_log_dir, write_log_dir):
    # Pick the configuration whose read+write throughput sum is highest and
    # print it as an "aio" JSON snippet.
    _, read_results = get_sorted_results(read_log_dir, READ_SPEED)
    _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED)
    # key[0] is the op name ('read'/'write'); strip it so both sides share keys.
    combined_perf = {key[1:]: value for key, value in read_results.items()}

    for key, value in write_results.items():
        new_key = key[1:]
        if new_key in combined_perf:
            combined_perf[new_key] += value
        else:
            # NOTE(review): write-only configs are pinned to 0 rather than
            # `value`, so a config never wins unless it has a read result —
            # presumably intentional; confirm.
            combined_perf[new_key] = 0

    # Linear scan for the best combined throughput.
    optimal_key = None
    optimal_perf = 0.0
    for key, value in combined_perf.items():
        if value > optimal_perf:
            optimal_perf = value
            optimal_key = key

    aio_param = {"aio": convert_to_param(optimal_key)}

    # Map the stripped key back to the full per-op keys to report each side.
    read_perf_keys = {key[1:]: key for key in read_results.keys()}
    write_perf_keys = {key[1:]: key for key in write_results.keys()}
    optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
    optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)

    print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}')
    print(json.dumps(aio_param, indent=3))
def main():
    """Entry point: validate the log folders, then emit the best aio config."""
    print('Generate aio param')
    args = parse_arguments()
    if not validate_args(args):
        quit()

    read_log_dir = os.path.join(args.log_dir, READ_LOG_DIR)
    write_log_dir = os.path.join(args.log_dir, WRITE_LOG_DIR)
    generate_aio_param(read_log_dir, write_log_dir)


if __name__ == "__main__":
    main()
deepspeed/ops/csrc/aio/py_test/aio_bench_perf_sweep.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
import
sys
import
argparse
import
json
import
itertools
import
subprocess
import
shutil
from
test_ds_aio_utils
import
refine_integer_value
from
perf_sweep_utils
import
READ_OP_DESC
,
WRITE_OP_DESC
,
BENCH_LOG_DIR
,
\
READ_IO_DIR
,
WRITE_IO_DIR
,
READ_LOG_DIR
,
WRITE_LOG_DIR
OTHER_OPTIONS
=
'--handle'
PERF_SCRIPT
=
'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG
=
{
"block_size"
:
[
"128K"
,
"256K"
],
"queue_depth"
:
[
4
,
16
,
32
],
"overlap_events"
:
[
True
,
False
],
"io_parallel"
:
[
2
,
8
],
"single_submit"
:
[
False
]
}
class Job(object):
    """A shell command with optional output redirection and working directory.

    `output_fd` stays None until open_output_file() is called; both stdout and
    stderr are routed to the same file descriptor.
    """

    def __init__(self, cmd_line, output_file=None, work_dir=None):
        self.cmd_line = cmd_line
        self.output_file = output_file
        self.work_dir = work_dir
        self.output_fd = None

    def cmd(self):
        """Command line as the list it was constructed with."""
        return self.cmd_line

    def get_stdout(self):
        """File object for stdout redirection (None means inherit)."""
        return self.output_fd

    def get_stderr(self):
        """File object for stderr redirection (same sink as stdout)."""
        return self.output_fd

    def get_cwd(self):
        """Working directory for the command (None means inherit)."""
        return self.work_dir

    def open_output_file(self):
        """Open the log file for writing, if one was configured."""
        if self.output_file is None:
            return
        self.output_fd = open(self.output_file, 'w')

    def close_output_file(self):
        """Close and forget the log file descriptor, if it is open."""
        if self.output_fd is None:
            return
        self.output_fd.close()
        self.output_fd = None
class SweepConfig(object):
    # Aggregates the parsed CLI arguments into the knobs the sweep driver uses.
    def __init__(self, args):
        self.nvme_dir = args.nvme_dir   # writeable directory on the NVMe device
        self.io_size = args.io_size     # bytes per test, e.g. "400M"
        self.search_space = get_sweep_config_dict(args.sweep_config)
        self.read = not args.no_read    # run the read sweep?
        self.write = not args.no_write  # run the write sweep?
        self.flush_cache = not args.no_sudo  # page-cache flush requires sudo
        self.log_dir = args.log_dir
        self.loops = args.loops
        # Options appended to every generated benchmark command line.
        self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}'
def parse_arguments():
    # CLI for the sweep driver. Only --nvme_dir is mandatory; everything else
    # has a benchmark-friendly default.
    parser = argparse.ArgumentParser()

    parser.add_argument('--nvme_dir',
                        required=True,
                        type=str,
                        help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.')

    parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.')

    parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.')

    parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.')

    parser.add_argument('--io_size',
                        type=str,
                        default="400M",
                        help='Number of I/O bytes to read/write for performance measurements.')

    parser.add_argument(
        '--no_sudo',
        action='store_true',
        help='Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.')

    parser.add_argument('--log_dir',
                        type=str,
                        default=BENCH_LOG_DIR,
                        help=f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}')

    parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')

    args = parser.parse_args()
    print(f'args = {args}')

    return args
def dump_cmd_lines(cmd_lines):
    """Debug helper: print the candidate command lines with their indices."""
    print(f'cmd line count = {len(cmd_lines)}')
    for index, cmd in enumerate(cmd_lines):
        print(f'{index}: {cmd}')
def get_sweep_config_dict(sweep_config_json):
    """Load the sweep search space from a JSON file, or use the default."""
    if sweep_config_json is None:
        return DEFAULT_SWEEP_CONFIG
    with open(sweep_config_json) as fp:
        return json.load(fp)
def get_sweep_cmd_lines(sweep_config_dict):
    """Cross-product of all option values into per-run command-line lists.

    Boolean values render as a bare flag when True and a placeholder space
    when False; other values render as '--key value'.
    """
    def render_axis(key, value_list):
        rendered = []
        for v in value_list:
            if type(v) is bool:
                rendered.append(f'--{key}' if v else ' ')
            else:
                rendered.append(f'--{key} {v}')
        return rendered

    option_axes = [render_axis(key, values) for key, values in sweep_config_dict.items()]
    return [list(combo) for combo in itertools.product(*option_axes)]
def run_job(job):
    # Run `job` through the shell, redirecting stdout/stderr to its log file
    # (when configured), and abort the whole sweep on a non-zero exit code.
    args = ' '.join(job.cmd())
    print(f'args = {args}')
    job.open_output_file()
    proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
    job.close_output_file()
    assert proc.returncode == 0, \
    f"This command failed: {job.cmd()}"
def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
    """Run every perf job; sync+drop caches before each one when permitted."""
    can_flush = flush_cache_job is not None
    for perf_job in sweep_jobs:
        if can_flush:
            run_job(sync_job)
            run_job(flush_cache_job)
        run_job(perf_job)
        run_job(sync_job)
def create_cmd_tags(cmd_line):
    """Map each '--option[ value]' string to {option: value-or-None}."""
    tags = {}
    for param_value in cmd_line:
        pieces = param_value.split()
        if 1 <= len(pieces) <= 2:
            tags[pieces[0]] = pieces[1] if len(pieces) == 2 else None
    return tags


def get_log_file(io_op_desc, cmd_line):
    """Build a log-file name encoding the sweep configuration of cmd_line."""
    QUEUE_DEPTH = "--queue_depth"
    BLOCK_SIZE = "--block_size"
    SINGLE_SUBMIT = "--single_submit"
    OVERLAP_EVENTS = "--overlap_events"
    THREAD_COUNT = "--threads"
    IO_PARALLEL = "--io_parallel"

    # Short tag per option, plus the value assumed when the option is absent.
    tag_map = {
        QUEUE_DEPTH: "d",
        BLOCK_SIZE: "bs",
        SINGLE_SUBMIT: "single",
        OVERLAP_EVENTS: "overlap",
        THREAD_COUNT: "t",
        IO_PARALLEL: "p",
    }
    tag_default = {
        QUEUE_DEPTH: 1,
        BLOCK_SIZE: "1M",
        SINGLE_SUBMIT: "block",
        OVERLAP_EVENTS: "sequential",
        THREAD_COUNT: 1,
        IO_PARALLEL: 1,
    }

    def render_absent(tag):
        # Boolean flags fall back to their bare mode word; others get tag+default.
        if tag in (SINGLE_SUBMIT, OVERLAP_EVENTS):
            return tag_default[tag]
        return f'{tag_map[tag]}{tag_default[tag]}'

    def render_present(tag, value):
        prefix = tag_map[tag]
        return prefix if value is None else f'{prefix}{value}'

    ordered_tags = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
    cmd_tags = create_cmd_tags(cmd_line)
    parts = [io_op_desc]
    for tag in ordered_tags:
        if tag in cmd_tags:
            parts.append(render_present(tag, cmd_tags[tag]))
        else:
            parts.append(render_absent(tag))
    return '_'.join(parts) + '.txt'
def create_perf_jobs(io_op_desc, log_dir, cmd_lines):
    """Wrap each sweep command line in a Job that logs to its own file."""
    launcher = ['python', os.path.join(script_path(), PERF_SCRIPT)]
    return [
        Job(cmd_line=launcher + cmd,
            output_file=os.path.join(log_dir, get_log_file(io_op_desc, cmd)))
        for cmd in cmd_lines
    ]
def script_path():
    """Absolute directory containing the currently executing script."""
    entry_script = os.path.realpath(sys.argv[0])
    return os.path.dirname(entry_script)
def async_io_setup():
    # Deferred imports: deepspeed may not be installed when this file is
    # imported. True when the async-I/O extension can be built (libaio-dev).
    import deepspeed
    from deepspeed.ops.aio import AsyncIOBuilder
    return AsyncIOBuilder().is_compatible()
def get_block_size_and_count(io_bytes):
    """Factor io_bytes into dd-style (block_size, count), block_size a power of 1024."""
    bytes_in_KB = 1024
    block_size, block_count = 1, io_bytes
    while True:
        quotient, remainder = divmod(block_count, bytes_in_KB)
        if remainder != 0:
            break
        block_size *= bytes_in_KB
        block_count = quotient
    return int(block_size), int(block_count)
def create_read_file(sweep_config):
    # Materialize a random file of sweep_config.io_size bytes on the NVMe
    # device via dd, so read benchmarks have real data to fetch.
    read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}')
    os.makedirs(read_folder, exist_ok=True)
    read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
    # Split the byte count into a dd-friendly block size and count.
    block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
    dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
    print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    run_job(dd_job)
    print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
    # Returns the folder too so the caller can delete everything afterwards.
    return read_folder, read_file_name
def remove_folder(folder):
    """Delete a directory tree; fail loudly if it is not a directory."""
    missing_msg = f"Error: cannot remove {folder} - folder not found"
    assert os.path.isdir(folder), missing_msg
    shutil.rmtree(folder)
def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    # Generate the source file, prepend the read-specific options to every
    # command line, run the sweep, then clean up the generated data.
    read_folder, read_file_name = create_read_file(sweep_config)
    read_option = f'--read_file {read_file_name}'
    read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(read_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)

    # Remove the dd-generated input data from the NVMe device.
    remove_folder(read_folder)
def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
    # Prepare an output folder on the NVMe device, prepend the write-specific
    # options to every command line, run the sweep, then delete the output.
    write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}')
    os.makedirs(write_folder, exist_ok=True)
    write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
    write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
    write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
    #dump_cmd_lines(write_cmd_lines)

    log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
    os.makedirs(log_folder, exist_ok=True)

    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines)

    launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)

    # Remove the benchmark's output files from the NVMe device.
    remove_folder(write_folder)
def main():
    # Entry point: check the async-I/O environment, build the sweep command
    # lines, then run the read and/or write sweeps with cache flushing
    # between jobs when sudo is available.
    print("Running performance sweep of deepspeed nvme library")

    if not async_io_setup():
        error_msg = """
            Failing because environment is not properly configured for deepspeed async i/o module.
            Possible fix: apt install libaio-dev.
        """
        print(error_msg)
        quit()

    args = parse_arguments()

    sweep_config = SweepConfig(args)
    cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)

    if sweep_config.flush_cache:
        # Dropping the page cache needs root; skipped when --no_sudo was given.
        flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"])
    else:
        flush_cache_job = None

    sync_job = Job(cmd_line=['sync'])

    if sweep_config.read:
        run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)

    if sweep_config.write:
        run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)


if __name__ == "__main__":
    main()
deepspeed/ops/csrc/aio/py_test/ds_aio_basic.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
torch
import
os
import
time
from
deepspeed.ops.aio
import
AsyncIOBuilder
from
multiprocessing
import
Pool
,
Barrier
from
test_ds_aio_utils
import
report_results
,
task_log
,
task_barrier
def pre_basic(args, tid, read_op):
    # Per-task setup: allocate a pinned CPU buffer sized to the file (reads)
    # or the requested write size, and build the context dict the main/post
    # phases consume.
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    # Each writer task gets its own output file, suffixed with the task id.
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0
    return ctxt
def pre_basic_read(pool_params):
    """Pool entry: build a read context for one task."""
    args, tid = pool_params
    return pre_basic(args, tid, True)


def pre_basic_write(pool_params):
    """Pool entry: build a write context for one task."""
    args, tid = pool_params
    return pre_basic(args, tid, False)
def post_basic(pool_params):
    """Release the I/O buffer after the timed loop and return the context."""
    _, _, ctxt = pool_params
    io_buffer = ctxt["buffer"]
    io_buffer.detach()
    ctxt["buffer"] = None
    return ctxt
def main_basic_read(pool_params):
    # Timed phase: one synchronous module-level aio_read of the whole file.
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_read(ctxt['buffer'],
                                     ctxt['file'],
                                     args.block_size,
                                     args.queue_depth,
                                     args.single_submit,
                                     args.overlap_events,
                                     args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_basic_write(pool_params):
    # Timed phase: one synchronous module-level aio_write of the whole buffer.
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_write(ctxt['buffer'],
                                      ctxt['file'],
                                      args.block_size,
                                      args.queue_depth,
                                      args.single_submit,
                                      args.overlap_events,
                                      args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt
def get_schedule(args, read_op):
    """Select the pre/main/post callables for a read or write run."""
    if read_op:
        return {'pre': pre_basic_read, 'post': post_basic, 'main': main_basic_read}
    return {'pre': pre_basic_write, 'post': post_basic, 'main': main_basic_write}
def _aio_handle_tasklet(pool_params):
    # Worker body: pre -> (main x loops) -> post, with a barrier between
    # phases so all tasks measure the same workload window.
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    # (wall time incl. barriers, pure I/O time, total bytes moved)
    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
    # Pool initializer: publish the shared Barrier as a module global so each
    # worker process can rendezvous via task_barrier().
    global aio_barrier
    aio_barrier = b
def aio_basic_multiprocessing(args, read_op):
    # Fan out one tasklet per thread via a process Pool, synchronized with a
    # shared Barrier, then aggregate and report the per-task timings.
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
deepspeed/ops/csrc/aio/py_test/ds_aio_handle.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
torch
import
os
import
time
from
multiprocessing
import
Pool
,
Barrier
from
deepspeed.ops.aio
import
AsyncIOBuilder
from
test_ds_aio_utils
import
report_results
,
task_log
,
task_barrier
def pre_handle(args, tid, read_op):
    # Per-task setup: allocate the I/O buffer (GPU or pinned CPU), create a
    # deepspeed aio handle, and build the context dict for the later phases.
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    # Each writer task gets its own output file, suffixed with the task id.
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    if args.gpu:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
    else:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
    task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')

    # io_parallel == 0/None means a single internal aio thread.
    io_parallel = args.io_parallel if args.io_parallel else 1
    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
                                                args.queue_depth,
                                                args.single_submit,
                                                args.overlap_events,
                                                io_parallel)
    task_log(tid, f'created deepspeed aio handle')

    ctxt = {}
    ctxt['file'] = file
    ctxt['num_bytes'] = num_bytes
    ctxt['handle'] = handle
    ctxt['buffer'] = buffer
    ctxt['elapsed_sec'] = 0
    return ctxt
def pre_handle_read(pool_params):
    """Pool entry: build a read context (buffer + aio handle) for one task."""
    args, tid = pool_params
    return pre_handle(args, tid, True)


def pre_handle_write(pool_params):
    """Pool entry: build a write context (buffer + aio handle) for one task."""
    args, tid = pool_params
    return pre_handle(args, tid, False)
def post_handle(pool_params):
    """Release the I/O buffer after the timed loop and return the context."""
    _, _, ctxt = pool_params
    io_buffer = ctxt["buffer"]
    io_buffer.detach()
    ctxt["buffer"] = None
    return ctxt
def main_parallel_read(pool_params):
    # Timed phase: asynchronous pread (async flag True) followed by wait(),
    # used when --io_parallel enables multiple internal aio threads.
    args, tid, ctxt = pool_params
    handle = ctxt['handle']

    start_time = time.time()
    ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_parallel_write(pool_params):
    # Timed phase: asynchronous pwrite followed by wait().
    args, tid, ctxt = pool_params
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
    assert ret != -1
    handle.wait()
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt
def main_handle_read(pool_parms):
    # Timed phase: blocking handle.read (no explicit wait needed).
    args, tid, ctxt = pool_parms
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt


def main_handle_write(pool_parms):
    # Timed phase: blocking handle.write.
    args, tid, ctxt = pool_parms
    handle = ctxt['handle']
    start_time = time.time()
    ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
    assert ret != -1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt
def get_schedule(args, read_op):
    """Select pre/main/post callables; parallel variants when io_parallel set."""
    if read_op:
        main_fn = main_parallel_read if args.io_parallel else main_handle_read
        return {'pre': pre_handle_read, 'post': post_handle, 'main': main_fn}
    main_fn = main_parallel_write if args.io_parallel else main_handle_write
    return {'pre': pre_handle_write, 'post': post_handle, 'main': main_fn}
def _aio_handle_tasklet(pool_params):
    # Worker body: pre -> (main x loops) -> post, with a barrier between
    # phases so all tasks measure the same workload window.
    args, tid, read_op = pool_params

    # Create schedule
    schedule = get_schedule(args, read_op)
    task_log(tid, f'schedule = {schedule}')
    task_barrier(aio_barrier, args.threads)

    # Run pre task
    task_log(tid, f'running pre-task')
    ctxt = schedule["pre"]((args, tid))
    task_barrier(aio_barrier, args.threads)

    # Run main tasks in a loop
    ctxt["main_task_sec"] = 0
    for i in range(args.loops):
        task_log(tid, f'running main task {i}')
        start_time = time.time()
        ctxt = schedule["main"]((args, tid, ctxt))
        task_barrier(aio_barrier, args.threads)
        stop_time = time.time()
        ctxt["main_task_sec"] += stop_time - start_time

    # Run post task
    task_log(tid, f'running post-task')
    ctxt = schedule["post"]((args, tid, ctxt))
    task_barrier(aio_barrier, args.threads)

    # (wall time incl. barriers, pure I/O time, total bytes moved)
    return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_tasklet(b):
    # Pool initializer: publish the shared Barrier as a module global so each
    # worker process can rendezvous via task_barrier().
    global aio_barrier
    aio_barrier = b
def aio_handle_multiprocessing(args, read_op):
    # Fan out one tasklet per thread via a process Pool, synchronized with a
    # shared Barrier, then aggregate and report the per-task timings.
    b = Barrier(args.threads)
    pool_params = [(args, p, read_op) for p in range(args.threads)]
    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
        pool_results = p.map(_aio_handle_tasklet, pool_params)

    report_results(args, read_op, pool_results)
deepspeed/ops/csrc/aio/py_test/parse_aio_stats.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
import
argparse
import
re
READ_SPEED
=
'read_speed'
WRITE_SPEED
=
'write_speed'
PERF_METRICS
=
[
READ_SPEED
,
WRITE_SPEED
]
METRIC_SEARCH
=
{
READ_SPEED
:
'E2E Read Speed'
,
WRITE_SPEED
:
'E2E Write Speed'
}
def parse_arguments():
    """CLI: the log folder plus which metric (read_speed/write_speed) to report."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs')
    parser.add_argument('--metric',
                        type=str,
                        required=True,
                        help='Performance metric to report: [read_speed|write_speed]')
    args = parser.parse_args()
    print(f'args = {args}')
    return args
def extract_value(key, file):
    """Decode one underscore-separated log-name field into a typed value.

    't4'/'d8'/'p1' -> int; 'bs128K'/'bs2M' -> bytes as int; 'ds*' and anything
    unrecognized is returned unchanged; malformed numeric fields yield None
    (with a diagnostic naming `file`).
    """
    INVALID_PREFIXES = ["ds"]
    for p in INVALID_PREFIXES:
        if key.startswith(p):
            return key
    try:
        if key[0] in ['t', 'd', 'p']:
            return int(key[1:])
        if key.startswith("bs"):
            if key.endswith('K'):
                v = key[2:].split('K')
                return int(v[0]) * 1024
            elif key.endswith('M'):
                v = key[2:].split('M')
                return int(v[0]) * 1024 * 1024
            else:
                return int(key[2:])
    # FIX: was a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt; only parsing failures are expected here.
    except (ValueError, IndexError):
        print(f"{file}: extract_value fails on {key}")
        return None

    return key
def get_file_key(file):
    """Decode a log-file basename into a tuple of typed configuration fields."""
    stem, _ = os.path.splitext(os.path.basename(file))
    return tuple(extract_value(field, file) for field in stem.split('_'))
def get_thread_count(file):
    """Thread count encoded as a 't<N>' field in the log-file name; default 1."""
    stem, _ = os.path.splitext(os.path.basename(file))
    for field in stem.split('_'):
        if field[0] == 't':
            return int(field[1:])
    return 1
"""
Extract performance metric from log file.
Sample file lines are:
Task Read Latency = 0.031647682189941406 sec
Task Read Speed = 12.342926020792527 GB/sec
E2E Read Latency = 0.031697988510131836 sec
E2E Read Speed = 12.323337169333062 GB/sec
For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned
"""
def get_metric(file, metric):
    """Scan a benchmark log for `metric` and return its float value, or None.

    Speed metrics take the second-to-last whitespace token of the matching
    line (the number before the 'GB/sec' unit); other metrics take everything
    after the '=' sign.
    """
    # FIX: removed an unused `thread_count = get_thread_count(file)` local —
    # the call is pure, so dropping it changes nothing observable.
    with open(file) as f:
        for line in f.readlines():
            if line.startswith(METRIC_SEARCH[metric]):
                if metric in [READ_SPEED, WRITE_SPEED]:
                    fields = line.split()
                    return float(fields[-2])
                else:
                    fields = line.split('=')
                    return float(fields[-1])

    return None
def validate_args(args):
    """The metric must be known and the log folder must exist."""
    if args.metric not in PERF_METRICS:
        print(f'{args.metric} is not a valid performance metrics')
        return False
    if not os.path.isdir(args.log_dir):
        print(f'{args.log_dir} folder is not existent')
        return False
    return True
def get_results(log_files, metric):
    """Map each log file's decoded key to its extracted metric value."""
    return {get_file_key(f): get_metric(f, metric) for f in log_files}
def get_sorted_results(log_dir, metric):
    """Return (sorted keys, key->metric dict) over every file in log_dir."""
    log_paths = [
        os.path.join(log_dir, name) for name in os.listdir(log_dir)
        if os.path.isfile(os.path.join(log_dir, name))
    ]
    results = get_results(log_paths, metric)
    return sorted(results.keys()), results
def main():
    """Entry point: print every (config key, metric value) pair, sorted."""
    print("Parsing aio statistics")
    args = parse_arguments()
    if not validate_args(args):
        quit()

    sorted_keys, results = get_sorted_results(args.log_dir, args.metric)
    for k in sorted_keys:
        print(f'{k} = {results[k]}')


if __name__ == "__main__":
    main()
deepspeed/ops/csrc/aio/py_test/perf_sweep_utils.py
deleted
100644 → 0
View file @
1b2721ad
# Prefix shared by every artifact (directory, log) the aio benchmark sweep creates.
SCRIPT_PREFIX = '_aio_bench'
# Operation names used to build the per-operation directory names below.
WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read'
# Directories holding the data files read/written during the sweep.
READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io'
WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io'
# Directories holding benchmark log output: one combined, plus per-operation.
BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs'
READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs'
WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs'
deepspeed/ops/csrc/aio/py_test/run_read_sweep.sh
deleted
100644 → 0
View file @
1b2721ad
#!/bin/bash
# Require exactly two arguments: the input file to read and the log output dir.
if [[ $# -ne 2 ]]; then
    echo "Usage: $0 <input file> <output log dir>"
    exit 1
fi
# Verify the async I/O prerequisites via the helper script; abort the whole
# run with a hint if the environment is not usable.
validate_environment()
{
    if ! python ./validate_async_io.py; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}
# Fail fast if async I/O support is missing.
validate_environment

# First argument: the file every read benchmark will read from.
INPUT_FILE=$1
if [[ ! -f ${INPUT_FILE} ]]; then
    echo "Input file not found: ${INPUT_FILE}"
    exit 1
fi

# Second argument: parent directory for this sweep's log output.
LOG_DIR=$2/aio_perf_sweep
RUN_SCRIPT=./test_ds_aio.py
READ_OPT="--read_file ${INPUT_FILE}"

# Reuse the log directory if it exists (clearing old logs), else create it.
if [[ -d ${LOG_DIR} ]]; then
    rm -f ${LOG_DIR}/*
else
    mkdir -p ${LOG_DIR}
fi

# Command strings eval'd between runs: drop the page cache (needs sudo) so
# each benchmark measures real disk reads, and flush dirty pages afterwards.
DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"
# Sweep the read-benchmark parameter space:
#   sub: submission mode (single_submit vs block)
#   ov:  event handling (overlap_events vs sequential)
#   t:   thread count; p: io_parallel; d: queue depth; bs: block size
# Each combination runs once, with the page cache dropped beforehand and a
# sync afterwards; output goes to a per-combination log file.
for sub in single block; do
    if [[ $sub == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ $ov == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        # Scheduler-level options vs per-run I/O options.
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        # Log file name encodes every swept parameter so the
                        # result parser can recover them from the filename.
                        LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        # Echo each command before eval'ing it, for traceability.
                        echo ${DISABLE_CACHE}
                        echo ${cmd}
                        echo ${SYNC}
                        eval ${DISABLE_CACHE}
                        eval ${cmd}
                        eval ${SYNC}
                        # Brief settle time between runs.
                        sleep 2
                    done
                done
            done
        done
    done
done
Prev
1
…
10
11
12
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment