Commit 67ea635f authored by aiss

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages in 0 seconds
@@ -6,6 +6,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
+#include <iostream>
#include "deepspeed_aio_utils.h"
@@ -113,8 +114,8 @@ void* ds_page_aligned_alloc(const size_t size, const bool lock)
auto mlock_ret = mlock(ptr, size);
if (mlock_ret != 0) {
auto mlock_error = errno;
printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error));
std::cerr << "mlock failed to allocate " << size << " bytes with error no " << mlock_error
<< " msg " << strerror(mlock_error) << std::endl;
free(ptr);
return nullptr;
}
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for managing CPU tensors occupying page-locked memory.
*/
#include "deepspeed_pin_tensor.h"
using namespace std;
deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t()
{
for (auto iter = _locked_tensors.begin(); iter != _locked_tensors.end(); ++iter) {
munlock(iter->first, iter->second);
}
_locked_tensors.clear();
}
torch::Tensor deepspeed_pin_tensor_t::alloc(const size_t num_elem, const at::ScalarType& elem_type)
{
const auto num_bytes = num_elem * elementSize(elem_type);
auto pinned_buffer = ds_page_aligned_alloc(num_bytes, true);
assert(nullptr != pinned_buffer);
_locked_tensors[pinned_buffer] = num_bytes;
auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU);
    // from_blob sizes are in elements, not bytes; passing num_bytes here would
    // over-read the buffer for multi-byte dtypes.
    return at::from_blob(pinned_buffer, static_cast<long int>(num_elem), options);
}
bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor)
{
auto addr = locked_tensor.data_ptr();
if (_locked_tensors.find(addr) != _locked_tensors.end()) {
munlock(addr, _locked_tensors[addr]);
_locked_tensors.erase(addr);
return true;
}
return false;
}
/*
Copyright 2023 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for managing CPU tensors occupying page-locked memory.
TODO: Implement a full-featured manager that
1. Avoid page-locked memory leaks
2. Minimize page-locked memory usage by reducing internal fragmentation
*/
#include <map>
#include "deepspeed_py_aio.h"
struct deepspeed_pin_tensor_t {
std::map<void*, size_t> _locked_tensors;
deepspeed_pin_tensor_t() = default;
~deepspeed_pin_tensor_t();
torch::Tensor alloc(const size_t num_elem, const at::ScalarType& elem_type);
bool free(torch::Tensor& locked_tensor);
};
@@ -22,7 +22,8 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
_overlap_events(overlap_events),
_num_threads(num_threads),
_aio_config(block_size, queue_depth, single_submit, overlap_events, false),
-      _num_pending_ops(0)
+      _num_pending_ops(0),
+      _pinned_tensor_mgr(new deepspeed_pin_tensor_t())
{
for (auto i = 0; i < num_threads; ++i) {
_thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
@@ -280,3 +281,14 @@ int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
{
return pwrite(buffer, filename, false, true);
}
at::Tensor deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem,
const torch::Tensor& example_tensor)
{
return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type());
}
bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor)
{
return _pinned_tensor_mgr->free(locked_tensor);
}
@@ -8,6 +8,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include <condition_variable>
#include <memory>
#include "deepspeed_aio_thread.h"
#include "deepspeed_pin_tensor.h"
struct deepspeed_aio_handle_t {
std::unique_ptr<struct aio_context> _aio_ctxt;
@@ -19,6 +20,7 @@ struct deepspeed_aio_handle_t {
std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
std::vector<std::thread> _threads;
int _num_pending_ops;
+    std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
deepspeed_aio_handle_t(const int block_size,
const int queue_depth,
@@ -56,6 +58,11 @@ struct deepspeed_aio_handle_t {
int async_pwrite(const torch::Tensor& buffer, const char* filename);
+    // TODO: Change the API args to shape and dtype.
+    torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor);
+    bool free_cpu_locked_tensor(torch::Tensor& locked_tensor);
int wait();
void _stop_threads();
@@ -37,5 +37,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("async_pread", &deepspeed_aio_handle_t::async_pread)
.def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
.def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor)
.def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor)
.def("wait", &deepspeed_aio_handle_t::wait);
}
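The three new bindings above expose the pinned-tensor manager to Python. A minimal usage sketch, assuming the extension loads and using the same constructor arguments that appear in ds_aio_handle.py further down; the numeric values are illustrative placeholders, not tuned settings:

import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

# Create a handle: block_size, queue_depth, single_submit, overlap_events, num_threads.
handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)

# Allocate a page-locked CPU tensor; dtype is taken from the example tensor.
buffer = handle.new_cpu_locked_tensor(1024, torch.empty(0, dtype=torch.uint8))

# ... issue async_pread/async_pwrite against buffer, then handle.wait() ...

# Hand the page-locked memory back to the manager.
assert handle.free_cpu_locked_tensor(buffer)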
@@ -15,6 +15,7 @@ import shutil
from test_ds_aio_utils import refine_integer_value
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
+from deepspeed.ops.op_builder import AsyncIOBuilder
OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py'
@@ -277,8 +278,6 @@ def script_path():
def async_io_setup():
-    import deepspeed
-    from deepspeed.ops.aio import AsyncIOBuilder
return AsyncIOBuilder().is_compatible()
@@ -8,9 +8,10 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import torch
import os
import time
-from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import AsyncIOBuilder
def pre_basic(args, tid, read_op):
@@ -19,7 +20,10 @@ def pre_basic(args, tid, read_op):
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
-    buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
+    buffer = get_accelerator().pin_memory(
+        torch.empty(num_bytes,
+                    dtype=torch.uint8,
+                    device='cpu'))
task_log(
tid,
f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
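The pre_basic change above replaces torch's CUDA-oriented .pin_memory() with the accelerator abstraction. A minimal sketch of the two equivalent paths (buffer size illustrative):

import torch
from deepspeed.accelerator import get_accelerator

# Old path: tensor method, tied to a CUDA-capable build.
pinned_old = torch.empty(1024, dtype=torch.uint8, device='cpu').pin_memory()

# New path: backend-agnostic pinning through the active accelerator.
pinned_new = get_accelerator().pin_memory(torch.empty(1024, dtype=torch.uint8, device='cpu'))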
@@ -9,8 +9,9 @@ import torch
import os
import time
from multiprocessing import Pool, Barrier
-from deepspeed.ops.aio import AsyncIOBuilder
from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import AsyncIOBuilder
def pre_handle(args, tid, read_op):
@@ -18,23 +19,30 @@ def pre_handle(args, tid, read_op):
num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
file = args.read_file if read_op else f'{args.write_file}.{tid}'
-    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
-    if args.gpu:
-        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
-    else:
-        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
-    task_log(
-        tid,
-        f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
-    )
io_parallel = args.io_parallel if args.io_parallel else 1
handle = AsyncIOBuilder().load().aio_handle(args.block_size,
args.queue_depth,
args.single_submit,
args.overlap_events,
io_parallel)
-    task_log(tid, f'created deepspeed aio handle')
+    task_log(tid, f'Created deepspeed aio handle')
+    if args.gpu:
+        buffer = torch.empty(num_bytes,
+                             dtype=torch.uint8,
+                             device=get_accelerator().device_name())
+    else:
+        if args.use_accelerator_pin_memory:
+            buffer = get_accelerator().pin_memory(
+                torch.empty(num_bytes,
+                            dtype=torch.uint8,
+                            device='cpu'))
+        else:
+            buffer = handle.new_cpu_locked_tensor(num_bytes,
+                                                  torch.empty(0,
+                                                              dtype=torch.uint8))
+    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
ctxt = {}
ctxt['file'] = file
@@ -43,6 +51,11 @@ def pre_handle(args, tid, read_op):
ctxt['buffer'] = buffer
ctxt['elapsed_sec'] = 0
+    task_log(
+        tid,
+        f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
+    )
return ctxt
@@ -7,7 +7,6 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
import os
import argparse
import re
READ_SPEED = 'read_speed'
WRITE_SPEED = 'write_speed'
'''Copyright The Microsoft DeepSpeed Team'''
SCRIPT_PREFIX = '_aio_bench'
WRITE_OP_DESC = 'write'
READ_OP_DESC = 'read'
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
@@ -6,11 +6,7 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import torch
import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
@@ -67,6 +63,10 @@ def parse_arguments():
parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
+    parser.add_argument('--use_accelerator_pin_memory',
+                        action='store_true',
+                        help='Obtain pinned (CPU page-locked) tensors from accelerator')
args = parser.parse_args()
print(f'args = {args}')
return args
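With the new flag wired in, a handle-mode run that exercises accelerator pinning would look roughly as follows; the file-argument flag name is inferred from the args.* attributes above and may differ:

python test_ds_aio.py --handle --read_file /path/to/input --use_accelerator_pin_memory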
@@ -5,8 +5,6 @@ Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
BYTES_PER_GB = 1024**3
LOG_TIDS = [0]
@@ -4,6 +4,5 @@ Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import deepspeed
-from deepspeed.ops.aio import AsyncIOBuilder
+from deepspeed.ops.op_builder import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifdef __HIPCC__
#include "custom_hip_layers.h"
#else
#include "custom_cuda_layers.h"
#endif
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#ifdef _WIN32
#include <windows.h>
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifndef __TIMER_H__
#define __TIMER_H__
/*
Copyright The Microsoft DeepSpeed Team
*/
#pragma once
#include <ATen/cuda/CUDAContext.h>