Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
...@@ -6,6 +6,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -6,6 +6,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/ */
#include <cmath> #include <cmath>
#include <iostream>
#include "deepspeed_aio_utils.h" #include "deepspeed_aio_utils.h"
...@@ -113,8 +114,8 @@ void* ds_page_aligned_alloc(const size_t size, const bool lock) ...@@ -113,8 +114,8 @@ void* ds_page_aligned_alloc(const size_t size, const bool lock)
auto mlock_ret = mlock(ptr, size); auto mlock_ret = mlock(ptr, size);
if (mlock_ret != 0) { if (mlock_ret != 0) {
auto mlock_error = errno; auto mlock_error = errno;
printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error)); std::cerr << "mlock failed to allocate " << size << " bytes with error no " << mlock_error
<< " msg " << strerror(mlock_error) << std::endl;
free(ptr); free(ptr);
return nullptr; return nullptr;
} }
......
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for managing CPU tensors occupying page-locked memory.
*/
#include "deepspeed_pin_tensor.h"
using namespace std;
deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t()
{
    // Unpin every buffer we locked in alloc(); the backing memory itself is
    // released elsewhere (the map only tracks address -> locked byte count).
    for (const auto& entry : _locked_tensors) { munlock(entry.first, entry.second); }
    _locked_tensors.clear();
}
torch::Tensor deepspeed_pin_tensor_t::alloc(const size_t num_elem, const at::ScalarType& elem_type)
{
    // Allocate a page-aligned, mlock()-ed CPU buffer large enough for
    // num_elem elements of elem_type, and wrap it in a tensor without copying.
    const auto num_bytes = num_elem * elementSize(elem_type);
    auto pinned_buffer = ds_page_aligned_alloc(num_bytes, true);
    // NOTE(review): assert is compiled out in release builds, so a failed
    // allocation would fall through to from_blob(nullptr, ...) — consider a
    // hard error here. TODO confirm intended failure policy.
    assert(nullptr != pinned_buffer);

    // Remember the locked region so free()/the destructor can munlock it.
    _locked_tensors[pinned_buffer] = num_bytes;

    auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU);

    // Bug fix: the tensor length is a count of ELEMENTS, not bytes. Passing
    // num_bytes here made the tensor over-claim the buffer for any dtype wider
    // than one byte (e.g. 4x for float32), causing out-of-bounds access.
    return at::from_blob(pinned_buffer, static_cast<long int>(num_elem), options);
}
bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor)
{
    // Unpin and forget a tensor previously produced by alloc().
    // Returns false when the tensor's storage is not one we are tracking.
    const auto addr = locked_tensor.data_ptr();
    const auto entry = _locked_tensors.find(addr);
    if (entry == _locked_tensors.end()) { return false; }

    munlock(entry->first, entry->second);
    _locked_tensors.erase(entry);
    return true;
}
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for managing CPU tensors occupying page-locked memory.
TODO: Implement a full-featured manager that
1. Avoid page-locked memory leaks
2. Minimize page-locked memory usage by reducing internal fragmentation
*/
#include <map>
#include "deepspeed_py_aio.h"
// Manager for CPU tensors backed by page-locked (pinned) memory.
// Tracks every locked allocation so it can be unlocked on free() or
// when the manager itself is destroyed.
struct deepspeed_pin_tensor_t {
    // Maps buffer base address -> locked size in bytes, for munlock bookkeeping.
    std::map<void*, size_t> _locked_tensors;

    deepspeed_pin_tensor_t() = default;

    // Unlocks (munlock) all still-registered buffers.
    ~deepspeed_pin_tensor_t();

    // Allocates a pinned CPU tensor of num_elem elements of elem_type.
    torch::Tensor alloc(const size_t num_elem, const at::ScalarType& elem_type);

    // Unlocks and forgets a tensor returned by alloc(); returns false if
    // the tensor's storage was not allocated by this manager.
    bool free(torch::Tensor& locked_tensor);
};
...@@ -22,7 +22,8 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, ...@@ -22,7 +22,8 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
_overlap_events(overlap_events), _overlap_events(overlap_events),
_num_threads(num_threads), _num_threads(num_threads),
_aio_config(block_size, queue_depth, single_submit, overlap_events, false), _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
_num_pending_ops(0) _num_pending_ops(0),
_pinned_tensor_mgr(new deepspeed_pin_tensor_t())
{ {
for (auto i = 0; i < num_threads; ++i) { for (auto i = 0; i < num_threads; ++i) {
_thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config)); _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
...@@ -280,3 +281,14 @@ int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char ...@@ -280,3 +281,14 @@ int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char
{ {
return pwrite(buffer, filename, false, true); return pwrite(buffer, filename, false, true);
} }
at::Tensor deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem,
                                                         const torch::Tensor& example_tensor)
{
    // Allocate a pinned CPU tensor via the handle's pin-tensor manager,
    // matching the dtype of example_tensor.
    const auto elem_type = example_tensor.scalar_type();
    return _pinned_tensor_mgr->alloc(num_elem, elem_type);
}
bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor)
{
    // Forward to the pin-tensor manager; true iff the tensor was one of ours.
    const bool released = _pinned_tensor_mgr->free(locked_tensor);
    return released;
}
...@@ -8,6 +8,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -8,6 +8,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include <condition_variable> #include <condition_variable>
#include <memory> #include <memory>
#include "deepspeed_aio_thread.h" #include "deepspeed_aio_thread.h"
#include "deepspeed_pin_tensor.h"
struct deepspeed_aio_handle_t { struct deepspeed_aio_handle_t {
std::unique_ptr<struct aio_context> _aio_ctxt; std::unique_ptr<struct aio_context> _aio_ctxt;
...@@ -19,6 +20,7 @@ struct deepspeed_aio_handle_t { ...@@ -19,6 +20,7 @@ struct deepspeed_aio_handle_t {
std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts; std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
std::vector<std::thread> _threads; std::vector<std::thread> _threads;
int _num_pending_ops; int _num_pending_ops;
std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
deepspeed_aio_handle_t(const int block_size, deepspeed_aio_handle_t(const int block_size,
const int queue_depth, const int queue_depth,
...@@ -56,6 +58,11 @@ struct deepspeed_aio_handle_t { ...@@ -56,6 +58,11 @@ struct deepspeed_aio_handle_t {
int async_pwrite(const torch::Tensor& buffer, const char* filename); int async_pwrite(const torch::Tensor& buffer, const char* filename);
// TODO: Make API's args to be shape and dtype.
torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor);
bool free_cpu_locked_tensor(torch::Tensor&);
int wait(); int wait();
void _stop_threads(); void _stop_threads();
......
...@@ -37,5 +37,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) ...@@ -37,5 +37,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("async_pread", &deepspeed_aio_handle_t::async_pread) .def("async_pread", &deepspeed_aio_handle_t::async_pread)
.def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
.def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor)
.def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor)
.def("wait", &deepspeed_aio_handle_t::wait); .def("wait", &deepspeed_aio_handle_t::wait);
} }
...@@ -15,6 +15,7 @@ import shutil ...@@ -15,6 +15,7 @@ import shutil
from test_ds_aio_utils import refine_integer_value from test_ds_aio_utils import refine_integer_value
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
from deepspeed.ops.op_builder import AsyncIOBuilder
OTHER_OPTIONS = '--handle' OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py' PERF_SCRIPT = 'test_ds_aio.py'
...@@ -277,8 +278,6 @@ def script_path(): ...@@ -277,8 +278,6 @@ def script_path():
def async_io_setup(): def async_io_setup():
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
return AsyncIOBuilder().is_compatible() return AsyncIOBuilder().is_compatible()
......
...@@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import torch import torch
import os import os
import time import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier from test_ds_aio_utils import report_results, task_log, task_barrier
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import AsyncIOBuilder
def pre_basic(args, tid, read_op): def pre_basic(args, tid, read_op):
...@@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op): ...@@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op):
file = args.read_file if read_op else f'{args.write_file}.{tid}' file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes') task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
task_log( task_log(
tid, tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
......
...@@ -9,8 +9,9 @@ import torch ...@@ -9,8 +9,9 @@ import torch
import os import os
import time import time
from multiprocessing import Pool, Barrier from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier from test_ds_aio_utils import report_results, task_log, task_barrier
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import AsyncIOBuilder
def pre_handle(args, tid, read_op): def pre_handle(args, tid, read_op):
...@@ -18,23 +19,30 @@ def pre_handle(args, tid, read_op): ...@@ -18,23 +19,30 @@ def pre_handle(args, tid, read_op):
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}' file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
if args.gpu:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
else:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
io_parallel = args.io_parallel if args.io_parallel else 1 io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size, handle = AsyncIOBuilder().load().aio_handle(args.block_size,
args.queue_depth, args.queue_depth,
args.single_submit, args.single_submit,
args.overlap_events, args.overlap_events,
io_parallel) io_parallel)
task_log(tid, f'created deepspeed aio handle') task_log(tid, f'Created deepspeed aio handle')
if args.gpu:
buffer = torch.empty(num_bytes,
dtype=torch.uint8,
device=get_accelerator().device_name())
else:
if args.use_accelerator_pin_memory:
buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
else:
buffer = handle.new_cpu_locked_tensor(num_bytes,
torch.empty(0,
dtype=torch.uint8))
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
ctxt = {} ctxt = {}
ctxt['file'] = file ctxt['file'] = file
...@@ -43,6 +51,11 @@ def pre_handle(args, tid, read_op): ...@@ -43,6 +51,11 @@ def pre_handle(args, tid, read_op):
ctxt['buffer'] = buffer ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0 ctxt['elapsed_sec'] = 0
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
return ctxt return ctxt
......
...@@ -7,7 +7,6 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -7,7 +7,6 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import os import os
import argparse import argparse
import re
READ_SPEED = 'read_speed' READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed' WRITE_SPEED = 'write_speed'
......
'''Copyright The Microsoft DeepSpeed Team'''
SCRIPT_PREFIX = '_aio_bench' SCRIPT_PREFIX = '_aio_bench'
WRITE_OP_DESC = 'write' WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read' READ_OP_DESC = 'read'
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -6,11 +6,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -6,11 +6,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
""" """
import os import os
import torch
import argparse import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing from ds_aio_handle import aio_handle_multiprocessing
...@@ -67,6 +63,10 @@ def parse_arguments(): ...@@ -67,6 +63,10 @@ def parse_arguments():
parser.add_argument('--gpu', action='store_true', help='Use GPU memory') parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
parser.add_argument('--use_accelerator_pin_memory',
action='store_true',
help='Obtain pinned (CPU page-locked) tensors from accelerator')
args = parser.parse_args() args = parser.parse_args()
print(f'args = {args}') print(f'args = {args}')
return args return args
......
...@@ -5,8 +5,6 @@ Licensed under the MIT license. ...@@ -5,8 +5,6 @@ Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices. Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
""" """
import os
BYTES_PER_GB = 1024**3 BYTES_PER_GB = 1024**3
LOG_TIDS = [0] LOG_TIDS = [0]
......
...@@ -4,6 +4,5 @@ Licensed under the MIT license. ...@@ -4,6 +4,5 @@ Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices. Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
""" """
import deepspeed from deepspeed.ops.op_builder import AsyncIOBuilder
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible() assert AsyncIOBuilder().is_compatible()
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifdef __HIPCC__
#include "custom_hip_layers.h"
#else
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#endif
__global__ void param_update_kernel(const float* input, __half* output, int size) __global__ void param_update_kernel(const float* input, __half* output, int size)
{ {
int id = blockIdx.x * blockDim.x + threadIdx.x; int id = blockIdx.x * blockDim.x + threadIdx.x;
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#ifdef _WIN32 #ifdef _WIN32
#include <windows.h> #include <windows.h>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifndef __TIMER_H__ #ifndef __TIMER_H__
#define __TIMER_H__ #define __TIMER_H__
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment