Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
...@@ -6,6 +6,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -6,6 +6,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/ */
#include <cmath> #include <cmath>
#include <iostream>
#include "deepspeed_aio_utils.h" #include "deepspeed_aio_utils.h"
...@@ -113,8 +114,8 @@ void* ds_page_aligned_alloc(const size_t size, const bool lock) ...@@ -113,8 +114,8 @@ void* ds_page_aligned_alloc(const size_t size, const bool lock)
auto mlock_ret = mlock(ptr, size); auto mlock_ret = mlock(ptr, size);
if (mlock_ret != 0) { if (mlock_ret != 0) {
auto mlock_error = errno; auto mlock_error = errno;
printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error)); std::cerr << "mlock failed to allocate " << size << " bytes with error no " << mlock_error
<< " msg " << strerror(mlock_error) << std::endl;
free(ptr); free(ptr);
return nullptr; return nullptr;
} }
......
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for managing CPU tensors occupying page-locked memory.
*/
#include "deepspeed_pin_tensor.h"
using namespace std;
deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t()
{
    // Unpin every buffer we locked in alloc(); the backing memory itself is
    // released elsewhere (the map only tracks address -> locked byte count).
    for (const auto& entry : _locked_tensors) { munlock(entry.first, entry.second); }
    _locked_tensors.clear();
}
torch::Tensor deepspeed_pin_tensor_t::alloc(const size_t num_elem, const at::ScalarType& elem_type)
{
    // Allocate a page-aligned, mlock()-ed CPU buffer large enough for
    // num_elem elements of elem_type, and wrap it in a tensor without copying.
    const auto num_bytes = num_elem * elementSize(elem_type);
    auto pinned_buffer = ds_page_aligned_alloc(num_bytes, true);
    // NOTE(review): assert is compiled out in release builds, so a failed
    // allocation would fall through to from_blob(nullptr, ...) — consider a
    // hard error here. TODO confirm intended failure policy.
    assert(nullptr != pinned_buffer);

    // Remember the locked region so free()/the destructor can munlock it.
    _locked_tensors[pinned_buffer] = num_bytes;

    auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU);

    // Bug fix: the tensor length is a count of ELEMENTS, not bytes. Passing
    // num_bytes here made the tensor over-claim the buffer for any dtype wider
    // than one byte (e.g. 4x for float32), causing out-of-bounds access.
    return at::from_blob(pinned_buffer, static_cast<long int>(num_elem), options);
}
bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor)
{
    // Unpin and forget a tensor previously produced by alloc().
    // Returns false when the tensor's storage is not one we are tracking.
    const auto addr = locked_tensor.data_ptr();
    const auto entry = _locked_tensors.find(addr);
    if (entry == _locked_tensors.end()) { return false; }

    munlock(entry->first, entry->second);
    _locked_tensors.erase(entry);
    return true;
}
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for managing CPU tensors occupying page-locked memory.
TODO: Implement a full-featured manager that
1. Avoid page-locked memory leaks
2. Minimize page-locked memory usage by reducing internal fragmentation
*/
#include <map>
#include "deepspeed_py_aio.h"
// Manager for CPU tensors backed by page-locked (pinned) memory.
// Tracks every locked allocation so it can be unlocked on free() or
// when the manager itself is destroyed.
struct deepspeed_pin_tensor_t {
    // Maps buffer base address -> locked size in bytes, for munlock bookkeeping.
    std::map<void*, size_t> _locked_tensors;

    deepspeed_pin_tensor_t() = default;

    // Unlocks (munlock) all still-registered buffers.
    ~deepspeed_pin_tensor_t();

    // Allocates a pinned CPU tensor of num_elem elements of elem_type.
    torch::Tensor alloc(const size_t num_elem, const at::ScalarType& elem_type);

    // Unlocks and forgets a tensor returned by alloc(); returns false if
    // the tensor's storage was not allocated by this manager.
    bool free(torch::Tensor& locked_tensor);
};
...@@ -22,7 +22,8 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, ...@@ -22,7 +22,8 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
_overlap_events(overlap_events), _overlap_events(overlap_events),
_num_threads(num_threads), _num_threads(num_threads),
_aio_config(block_size, queue_depth, single_submit, overlap_events, false), _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
_num_pending_ops(0) _num_pending_ops(0),
_pinned_tensor_mgr(new deepspeed_pin_tensor_t())
{ {
for (auto i = 0; i < num_threads; ++i) { for (auto i = 0; i < num_threads; ++i) {
_thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config)); _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
...@@ -280,3 +281,14 @@ int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char ...@@ -280,3 +281,14 @@ int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char
{ {
return pwrite(buffer, filename, false, true); return pwrite(buffer, filename, false, true);
} }
at::Tensor deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem,
                                                         const torch::Tensor& example_tensor)
{
    // Allocate a pinned CPU tensor via the handle's pin-tensor manager,
    // matching the dtype of example_tensor.
    const auto elem_type = example_tensor.scalar_type();
    return _pinned_tensor_mgr->alloc(num_elem, elem_type);
}
bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor)
{
    // Forward to the pin-tensor manager; true iff the tensor was one of ours.
    const bool released = _pinned_tensor_mgr->free(locked_tensor);
    return released;
}
...@@ -8,6 +8,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -8,6 +8,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include <condition_variable> #include <condition_variable>
#include <memory> #include <memory>
#include "deepspeed_aio_thread.h" #include "deepspeed_aio_thread.h"
#include "deepspeed_pin_tensor.h"
struct deepspeed_aio_handle_t { struct deepspeed_aio_handle_t {
std::unique_ptr<struct aio_context> _aio_ctxt; std::unique_ptr<struct aio_context> _aio_ctxt;
...@@ -19,6 +20,7 @@ struct deepspeed_aio_handle_t { ...@@ -19,6 +20,7 @@ struct deepspeed_aio_handle_t {
std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts; std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
std::vector<std::thread> _threads; std::vector<std::thread> _threads;
int _num_pending_ops; int _num_pending_ops;
std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
deepspeed_aio_handle_t(const int block_size, deepspeed_aio_handle_t(const int block_size,
const int queue_depth, const int queue_depth,
...@@ -56,6 +58,11 @@ struct deepspeed_aio_handle_t { ...@@ -56,6 +58,11 @@ struct deepspeed_aio_handle_t {
int async_pwrite(const torch::Tensor& buffer, const char* filename); int async_pwrite(const torch::Tensor& buffer, const char* filename);
// TODO: Make API's args to be shape and dtype.
torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor);
bool free_cpu_locked_tensor(torch::Tensor&);
int wait(); int wait();
void _stop_threads(); void _stop_threads();
......
...@@ -37,5 +37,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) ...@@ -37,5 +37,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("async_pread", &deepspeed_aio_handle_t::async_pread) .def("async_pread", &deepspeed_aio_handle_t::async_pread)
.def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
.def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor)
.def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor)
.def("wait", &deepspeed_aio_handle_t::wait); .def("wait", &deepspeed_aio_handle_t::wait);
} }
...@@ -15,6 +15,7 @@ import shutil ...@@ -15,6 +15,7 @@ import shutil
from test_ds_aio_utils import refine_integer_value from test_ds_aio_utils import refine_integer_value
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
from deepspeed.ops.op_builder import AsyncIOBuilder
OTHER_OPTIONS = '--handle' OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py' PERF_SCRIPT = 'test_ds_aio.py'
...@@ -277,8 +278,6 @@ def script_path(): ...@@ -277,8 +278,6 @@ def script_path():
def async_io_setup(): def async_io_setup():
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
return AsyncIOBuilder().is_compatible() return AsyncIOBuilder().is_compatible()
......
...@@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import torch import torch
import os import os
import time import time
from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier from test_ds_aio_utils import report_results, task_log, task_barrier
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import AsyncIOBuilder
def pre_basic(args, tid, read_op): def pre_basic(args, tid, read_op):
...@@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op): ...@@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op):
file = args.read_file if read_op else f'{args.write_file}.{tid}' file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes') task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
task_log( task_log(
tid, tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
......
...@@ -9,8 +9,9 @@ import torch ...@@ -9,8 +9,9 @@ import torch
import os import os
import time import time
from multiprocessing import Pool, Barrier from multiprocessing import Pool, Barrier
from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier from test_ds_aio_utils import report_results, task_log, task_barrier
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import AsyncIOBuilder
def pre_handle(args, tid, read_op): def pre_handle(args, tid, read_op):
...@@ -18,23 +19,30 @@ def pre_handle(args, tid, read_op): ...@@ -18,23 +19,30 @@ def pre_handle(args, tid, read_op):
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}' file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
if args.gpu:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
else:
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
io_parallel = args.io_parallel if args.io_parallel else 1 io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size, handle = AsyncIOBuilder().load().aio_handle(args.block_size,
args.queue_depth, args.queue_depth,
args.single_submit, args.single_submit,
args.overlap_events, args.overlap_events,
io_parallel) io_parallel)
task_log(tid, f'created deepspeed aio handle') task_log(tid, f'Created deepspeed aio handle')
if args.gpu:
buffer = torch.empty(num_bytes,
dtype=torch.uint8,
device=get_accelerator().device_name())
else:
if args.use_accelerator_pin_memory:
buffer = get_accelerator().pin_memory(
torch.empty(num_bytes,
dtype=torch.uint8,
device='cpu'))
else:
buffer = handle.new_cpu_locked_tensor(num_bytes,
torch.empty(0,
dtype=torch.uint8))
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
ctxt = {} ctxt = {}
ctxt['file'] = file ctxt['file'] = file
...@@ -43,6 +51,11 @@ def pre_handle(args, tid, read_op): ...@@ -43,6 +51,11 @@ def pre_handle(args, tid, read_op):
ctxt['buffer'] = buffer ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0 ctxt['elapsed_sec'] = 0
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
)
return ctxt return ctxt
......
...@@ -7,7 +7,6 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -7,7 +7,6 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import os import os
import argparse import argparse
import re
READ_SPEED = 'read_speed' READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed' WRITE_SPEED = 'write_speed'
......
'''Copyright The Microsoft DeepSpeed Team'''
SCRIPT_PREFIX = '_aio_bench' SCRIPT_PREFIX = '_aio_bench'
WRITE_OP_DESC = 'write' WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read' READ_OP_DESC = 'read'
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -6,11 +6,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. ...@@ -6,11 +6,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
""" """
import os import os
import torch
import argparse import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing from ds_aio_handle import aio_handle_multiprocessing
...@@ -67,6 +63,10 @@ def parse_arguments(): ...@@ -67,6 +63,10 @@ def parse_arguments():
parser.add_argument('--gpu', action='store_true', help='Use GPU memory') parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
parser.add_argument('--use_accelerator_pin_memory',
action='store_true',
help='Obtain pinned (CPU page-locked) tensors from accelerator')
args = parser.parse_args() args = parser.parse_args()
print(f'args = {args}') print(f'args = {args}')
return args return args
......
...@@ -5,8 +5,6 @@ Licensed under the MIT license. ...@@ -5,8 +5,6 @@ Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices. Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
""" """
import os
BYTES_PER_GB = 1024**3 BYTES_PER_GB = 1024**3
LOG_TIDS = [0] LOG_TIDS = [0]
......
...@@ -4,6 +4,5 @@ Licensed under the MIT license. ...@@ -4,6 +4,5 @@ Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices. Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
""" """
import deepspeed from deepspeed.ops.op_builder import AsyncIOBuilder
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible() assert AsyncIOBuilder().is_compatible()
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifdef __HIPCC__
#include "custom_hip_layers.h"
#else
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#endif
__global__ void param_update_kernel(const float* input, __half* output, int size) __global__ void param_update_kernel(const float* input, __half* output, int size)
{ {
int id = blockIdx.x * blockDim.x + threadIdx.x; int id = blockIdx.x * blockDim.x + threadIdx.x;
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#ifdef _WIN32 #ifdef _WIN32
#include <windows.h> #include <windows.h>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifndef __TIMER_H__ #ifndef __TIMER_H__
#define __TIMER_H__ #define __TIMER_H__
......
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once #pragma once
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment