"torchvision/models/vscode:/vscode.git/clone" did not exist on "a68db4fab7f65e5f49459d01e6250ec60bbca8b6"
Commit 7d1a83a9 authored by aiss's avatar aiss
Browse files

push Deepspeed 0.6.3 rocm version

parent ab5534fc
#!/usr/bin/env python
#!/usr/bin/env python3
import argparse
import json
......
#!/usr/bin/env python
#!/usr/bin/env python3
from deepspeed.env_report import main
from deepspeed.env_report import cli_main
if __name__ == '__main__':
main()
cli_main()
......
@@ -10,11 +10,25 @@ fi
hostfile=/job/hostfile
while getopts "h?f:" opt; do
case "$opt" in
h|\?)
echo "-f <hostfile>: specify a hostfile, defaults to /job/hostfile"
exit 0
;;
f)
hostfile=$OPTARG
shift $((OPTIND-1))
;;
esac
done
echo "hostfile=$hostfile"
if [ -f $hostfile ]; then
hosts=`cat $hostfile | awk '{print $1}' | paste -sd "," -`
export PDSH_RCMD_TYPE=ssh
pdsh -w ${hosts} $@
else
echo "Missing hostfile at ${hostfile}, executing command locally"
$@
echo "Missing hostfile at ${hostfile}, unable to proceed"
fi
#include "cpu_adagrad.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
void Adagrad_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size) {
float step_size = -1 * _alpha;
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = grads[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
variance += grad * grad;
grad = sqrt(variance);
grad += _eps;
grad = momentum / grad;
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
// STORE UPDATE TERM TO GRAD'S MEMORY
grads[k] = grad * step_size;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
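For reference, the scalar tail loop above applies the following per-element Adagrad update (a restatement of the code, not an independent derivation), with α = `_alpha`, ε = `_eps`, λ = `_weight_decay`, and G the running sum of squared gradients stored in `_exp_avg_sq`:

```latex
\begin{aligned}
g' &= g + \lambda p && \text{(only when } \lambda > 0\text{)}\\
G  &\leftarrow G + g'^{2}\\
p  &\leftarrow p - \alpha\,\frac{g}{\sqrt{G} + \epsilon}
\end{aligned}
```

Note that the raw gradient g (read into `momentum` before the weight-decay term is added) forms the numerator, and the scaled update term, -α·g/(√G + ε), is written back into the gradient buffer, as the comment in the loop indicates.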
void Adagrad_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adagrad_optimizer(int optimizer_id,
float alpha = 1e-2,
float eps = 1e-8,
float weight_decay = 0,
bool should_log = false)
{
auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
}
return 0;
}
void Adagrad_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adagrad_step(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
opt->SynchronizeStreams();
return 0;
}
int ds_adagrad_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int destroy_adagrad_optimizer(int optimizer_id)
{
s_optimizers.erase(optimizer_id);
return 0;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
m.def("adagrad_update_copy",
&ds_adagrad_step_plus_copy,
"DeepSpeed CPU Adagrad update and param copy (C++)");
m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
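The bindings above are meant to be driven from Python. A minimal sketch of that call sequence follows; it assumes the extension has already been compiled and is importable (the module name is whatever `TORCH_EXTENSION_NAME` resolves to at build time, so the `ds_adagrad` handle below is hypothetical), and the tensor sizes are purely illustrative.

```python
import torch

def adagrad_sketch(ds_adagrad, num_steps=3):
    """Drive the C++ CPU-Adagrad extension through one optimizer instance."""
    opt_id, lr, eps, weight_decay = 0, 1e-2, 1e-8, 0.0
    n = 1024
    params = torch.randn(n)          # fp32 master parameters on the CPU
    exp_avg_sq = torch.zeros(n)      # running sum of squared gradients
    ds_adagrad.create_adagrad(opt_id, lr, eps, weight_decay, True)
    for step in range(1, num_steps + 1):
        grads = torch.randn(n)       # stand-in for real gradients
        ds_adagrad.adagrad_update(opt_id, step, lr, eps, weight_decay,
                                  params, grads, exp_avg_sq)
    ds_adagrad.destroy_adagrad(opt_id)
```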
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adagrad_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
void Adagrad_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size) {
float step_size = -1 * _alpha;
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = grads[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
variance += grad * grad;
grad = sqrt(variance);
grad += _eps;
grad = momentum / grad;
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
// STORE UPDATE TERM TO GRAD'S MEMORY
grads[k] = grad * step_size;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
void Adagrad_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adagrad_optimizer(int optimizer_id,
float alpha = 1e-2,
float eps = 1e-8,
float weight_decay = 0,
bool should_log = false)
{
auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
}
return 0;
}
void Adagrad_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adagrad_step(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
opt->SynchronizeStreams();
return 0;
}
int ds_adagrad_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int destroy_adagrad_optimizer(int optimizer_id)
{
s_optimizers.erase(optimizer_id);
return 0;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
m.def("adagrad_update_copy",
&ds_adagrad_step_plus_copy,
"DeepSpeed CPU Adagrad update and param copy (C++)");
m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
This diff is collapsed.
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adam_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
void Adam_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size) {
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
float step_size = -1 * _alpha / _bias_correction1;
float w_decay = -1 * _alpha * _weight_decay;
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = _exp_avg[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
momentum = momentum * _betta1;
momentum = grad * betta1_minus1 + momentum;
variance = variance * _betta2;
grad = grad * grad;
variance = grad * betta2_minus1 + variance;
grad = sqrt(variance);
grad = grad * _bias_correction2 + _eps;
grad = momentum / grad;
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
_exp_avg[k] = momentum;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
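Restated compactly, the scalar tail loop above performs the following per-element update, where b1 and b2 denote `_bias_correction1` and `_bias_correction2` (maintained by `IncrementStep`/`update_state`, which are not part of this excerpt; presumably the usual Adam bias-correction terms):

```latex
\begin{aligned}
g &\leftarrow g + \lambda p && \text{(L2 mode, i.e. } \texttt{adamw\_mode} = \text{false)}\\
m &\leftarrow \beta_1 m + (1-\beta_1)\, g\\
v &\leftarrow \beta_2 v + (1-\beta_2)\, g^{2}\\
p &\leftarrow p - \alpha \lambda p && \text{(decoupled decay, } \texttt{adamw\_mode} = \text{true)}\\
p &\leftarrow p - \frac{\alpha}{b_1} \cdot \frac{m}{b_2 \sqrt{v} + \epsilon}
\end{aligned}
```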
void Adam_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adam_optimizer(int optimizer_id,
float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true,
bool should_log = false)
{
auto opt =
std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
alpha,
betta1,
betta2,
weight_decay,
(int)adamw_mode);
}
return 0;
}
void Adam_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adam_step(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
// assert(params.options().dtype() == grads.options().dtype());
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
nullptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int ds_adam_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int destroy_adam_optimizer(int optimizer_id)
{
s_optimizers.erase(optimizer_id);
return 0;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
m.def("adam_update_copy",
&ds_adam_step_plus_copy,
"DeepSpeed CPU Adam update and param copy (C++)");
m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}
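As with the Adagrad module, these bindings are consumed from Python. The sketch below exercises the `adam_update_copy` path, which updates the fp32 master weights on the CPU and copies the result into an fp16 parameter tensor on the accelerator; the module handle `ds_adam` is hypothetical (set by `TORCH_EXTENSION_NAME` at build time) and the hyperparameters are illustrative.

```python
import torch

def adam_offload_sketch(ds_adam, step, fp32_params, grads,
                        exp_avg, exp_avg_sq, fp16_gpu_params):
    """One fused CPU-Adam step followed by a device-side parameter refresh."""
    opt_id, lr, beta1, beta2, eps, wd = 0, 1e-3, 0.9, 0.999, 1e-8, 0.01
    ds_adam.create_adam(opt_id, lr, beta1, beta2, eps, wd, True, True)
    ds_adam.adam_update_copy(opt_id, step, lr, beta1, beta2, eps, wd, True,
                             fp32_params, grads, exp_avg, exp_avg_sq,
                             fp16_gpu_params)
    ds_adam.destroy_adam(opt_id)
```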
// !!! This is a file automatically generated by hipify!!!
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "multi_tensor_apply_hip.cuh"
#include "type_shim_hip.h"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum {
ADAM_MODE_0 = 0, // L2 regularization mode
ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW)
} adamMode_t;
using MATH_T = float;
template <typename T>
struct AdamFunctor {
__device__ __forceinline__ void operator()(int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
const float lr,
adamMode_t mode,
const float decay)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
// potentially use to pass in list of scalar
// int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx * chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx * chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx * chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx * chunk_size;
n -= chunk_idx * chunk_size;
// see note in multi_tensor_scale_kernel.cu
for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
if (mode == ADAM_MODE_0) { // L2
r_g[ii] = r_g[ii] + (decay * r_p[ii]);
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = next_m_unbiased / denom;
r_p[ii] = r_p[ii] - (lr * update);
} else { // weight decay
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (lr * update);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
p[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
void multi_tensor_adam_cuda(int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - ::pow(beta1, step);
bias_correction2 = 1 - ::pow(beta2, step);
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
0,
"adam",
multi_tensor_apply<4>(BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
bias_correction1,
bias_correction2,
epsilon,
lr,
(adamMode_t)mode,
weight_decay);)
AT_CUDA_CHECK(hipGetLastError());
}
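The two `adamMode_t` branches in `AdamFunctor` differ only in where the decay term λp enters. With c1 = 1 - β1^step and c2 = 1 - β2^step (the corrections computed above when `bias_correction == 1`) and learning rate η, the per-element updates are:

```latex
\begin{aligned}
\text{ADAM\_MODE\_0 (L2):} \quad & g \leftarrow g + \lambda p, \qquad
  p \leftarrow p - \eta\,\frac{m/c_1}{\sqrt{v/c_2} + \epsilon}\\
\text{ADAM\_MODE\_1 (AdamW):} \quad &
  p \leftarrow p - \eta\left(\frac{m/c_1}{\sqrt{v/c_2} + \epsilon} + \lambda p\right)
\end{aligned}
```

where m and v are the freshly updated first and second moments (the same recurrences as in the CPU kernels above).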
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "compat.h"
#include <assert.h>
// #include <iostream>
// This header is the one-stop shop for all your multi-tensor apply needs.
// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
template <int n>
struct TensorListMetadata {
void* addresses[n][depth_to_max_tensors[n - 1]];
int sizes[depth_to_max_tensors[n - 1]];
unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int.
int start_tensor_this_launch;
};
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int chunk_size,
volatile int* noop_flag,
T tl,
U callable,
ArgTypes... args)
{
// Hand the chunk information to the user-supplied functor to process however it likes.
callable(chunk_size, noop_flag, tl, args...);
}
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int block_size,
int chunk_size,
const at::Tensor& noop_flag,
const std::vector<std::vector<at::Tensor>>& tensor_lists,
T callable,
ArgTypes... args)
{
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for (int t = 0; t < tensor_lists[l].size(); t++) {
// TODO: Print which tensor fails.
bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
contiguous_memory = (contiguous_memory ||
tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
"A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
}
}
int ntensors = tensor_lists[0].size();
TensorListMetadata<depth> tl;
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
for (int t = 0; t < ntensors; t++) {
tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
for (int d = 0; d < depth; d++)
tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
loc_tensor_info++;
int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;
for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
// std::cout << chunks_this_tensor << std::endl;
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;
loc_block_info++;
bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
if (tensors_full || blocks_full || last_chunk) {
// using accscalar_t = acc_type<scalar_t, true>;
hipLaunchKernelGGL(( multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream,
chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);
AT_CUDA_CHECK(hipGetLastError());
// Reset. The control flow possibilities here make my brain hurt.
loc_block_info = 0;
if (chunk == chunks_this_tensor - 1) {
// std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 <<
// std::endl;
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
} else {
// std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 <<
// std::endl;
tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
for (int d = 0; d < depth; d++)
tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
loc_tensor_info = 1;
tl.start_tensor_this_launch = t;
}
}
}
}
}
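To make the batching arithmetic above concrete for the fused-Adam case (depth 4, lists g/p/m/v): each tensor t contributes ⌈numel(t)/chunk_size⌉ chunks, the metadata struct holds at most `depth_to_max_tensors[3]` = 36 tensors and `depth_to_max_blocks[3]` = 320 chunks, and a kernel launch is flushed whenever the chunk slots fill up, the tensor slots fill up at a tensor boundary, or the very last chunk has been queued. A lower bound on the number of launches is therefore:

```latex
\text{launches} \;\ge\; \left\lceil \frac{\sum_t \lceil \text{numel}(t)/\text{chunk\_size} \rceil}{320} \right\rceil
```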
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <libaio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <numeric>
#include <string>
#include <vector>
#include "deepspeed_aio_common.h"
using namespace std;
using namespace std::chrono;
#define DEBUG_DS_AIO_PERF 0
#define DEBUG_DS_AIO_SUBMIT_PERF 0
static const std::string c_library_name = "deepspeed_aio";
static void _report_aio_statistics(const char* tag,
const std::vector<std::chrono::duration<double>>& latencies)
__attribute__((unused));
static void _report_aio_statistics(const char* tag,
const std::vector<std::chrono::duration<double>>& latencies)
{
std::vector<double> lat_usec;
for (auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); }
const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end()));
const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end()));
const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size();
std::cout << c_library_name << ": latency statistics(usec) " << tag
<< " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl;
}
static void _get_aio_latencies(std::vector<std::chrono::duration<double>>& raw_latencies,
struct deepspeed_aio_latency_t& summary_latencies)
{
std::vector<double> lat_usec;
for (auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); }
summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end()));
summary_latencies._max_usec = *(std::max_element(lat_usec.begin(), lat_usec.end()));
summary_latencies._avg_usec =
std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size();
}
static void _do_io_submit_singles(const long long int n_iocbs,
const long long int iocb_index,
std::unique_ptr<aio_context>& aio_ctxt,
std::vector<std::chrono::duration<double>>& submit_times)
{
for (auto i = 0; i < n_iocbs; ++i) {
const auto st = std::chrono::high_resolution_clock::now();
const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, aio_ctxt->_iocbs.data() + i);
submit_times.push_back(std::chrono::high_resolution_clock::now() - st);
#if DEBUG_DS_AIO_SUBMIT_PERF
printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n",
submit_times.back().count() * 1e6,
iocb_index,
aio_ctxt->_iocbs[i]->u.c.buf,
aio_ctxt->_iocbs[i]->u.c.nbytes,
aio_ctxt->_iocbs[i]->u.c.offset);
#endif
assert(submit_ret > 0);
}
}
static void _do_io_submit_block(const long long int n_iocbs,
const long long int iocb_index,
std::unique_ptr<aio_context>& aio_ctxt,
std::vector<std::chrono::duration<double>>& submit_times)
{
const auto st = std::chrono::high_resolution_clock::now();
const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, aio_ctxt->_iocbs.data());
submit_times.push_back(std::chrono::high_resolution_clock::now() - st);
#if DEBUG_DS_AIO_SUBMIT_PERF
printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n",
submit_times.back().count() * 1e6,
iocb_index,
n_iocbs,
aio_ctxt->_iocbs[0]->u.c.buf,
aio_ctxt->_iocbs[0]->u.c.nbytes,
aio_ctxt->_iocbs[0]->u.c.offset);
#endif
assert(submit_ret > 0);
}
static int _do_io_complete(const long long int min_completes,
const long long int max_completes,
std::unique_ptr<aio_context>& aio_ctxt,
std::vector<std::chrono::duration<double>>& reap_times)
{
const auto start_time = std::chrono::high_resolution_clock::now();
const auto n_completes = io_getevents(
aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr);
reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time);
assert(n_completes >= min_completes);
return n_completes;
}
void do_aio_operation_sequential(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf)
{
struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs);
const auto num_io_blocks = static_cast<long long int>(
ceil(static_cast<double>(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size));
#if DEBUG_DS_AIO_PERF
const auto io_op_name = std::string(read_op ? "read" : "write");
std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes with " << num_io_blocks << " io blocks" << std::endl;
#endif
std::vector<std::chrono::duration<double>> submit_times;
std::vector<std::chrono::duration<double>> reap_times;
const auto max_queue_bytes =
static_cast<long long int>(aio_ctxt->_queue_depth * aio_ctxt->_block_size);
auto start = std::chrono::high_resolution_clock::now();
for (long long iocb_index = 0; iocb_index < num_io_blocks;
iocb_index += aio_ctxt->_queue_depth) {
const auto start_offset = iocb_index * aio_ctxt->_block_size;
const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset;
const auto n_iocbs =
min(static_cast<long long>(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index));
const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset));
prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset);
if (config->_single_submit) {
_do_io_submit_singles(n_iocbs, iocb_index, aio_ctxt, submit_times);
} else {
_do_io_submit_block(n_iocbs, iocb_index, aio_ctxt, submit_times);
}
_do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times);
}
const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
if (perf) {
_get_aio_latencies(submit_times, perf->_submit);
_get_aio_latencies(reap_times, perf->_complete);
perf->_e2e_usec = elapsed.count() * 1e6;
perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
}
#if DEBUG_DS_AIO_PERF
_report_aio_statistics("submit", submit_times);
_report_aio_statistics("complete", reap_times);
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
<< " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes " << std::endl;
#endif
}
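With the default configuration defined later in this diff (`c_block_size` = 128 KiB, `c_io_queue_depth` = 8), the sequential path above turns a transfer of N bytes into the following amount of work, fully submitting and then fully reaping each batch before starting the next:

```latex
\text{num\_io\_blocks} = \left\lceil \frac{N}{131072} \right\rceil, \qquad
\text{batches} = \left\lceil \frac{\text{num\_io\_blocks}}{8} \right\rceil, \qquad
\text{e2e rate (GB/s)} = \frac{N}{10^{9} \cdot t_{\text{elapsed}}}
```

The overlapped variant below relaxes this strict submit-then-reap alternation by keeping up to `_queue_depth` I/Os in flight and reaping at least one completion per iteration.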
void do_aio_operation_overlap(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf)
{
struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size);
#if DEBUG_DS_AIO_PERF
const auto io_op_name = std::string(read_op ? "read" : "write");
std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl;
#endif
std::vector<std::chrono::duration<double>> submit_times;
std::vector<std::chrono::duration<double>> reap_times;
auto request_iocbs = aio_ctxt->_queue_depth;
auto n_pending_iocbs = 0;
const auto min_completes = 1;
auto start = std::chrono::high_resolution_clock::now();
while (true) {
const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs);
if (n_iocbs > 0) {
if (config->_single_submit) {
_do_io_submit_singles(
n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times);
} else {
_do_io_submit_block(
n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times);
}
}
n_pending_iocbs += n_iocbs;
assert(n_pending_iocbs <= aio_ctxt->_queue_depth);
if (n_pending_iocbs == 0) { break; }
const auto n_complete =
_do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times);
n_pending_iocbs -= n_complete;
}
const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
if (perf) {
_get_aio_latencies(submit_times, perf->_submit);
_get_aio_latencies(reap_times, perf->_complete);
perf->_e2e_usec = elapsed.count() * 1e6;
perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
}
#if DEBUG_DS_AIO_PERF
_report_aio_statistics("submit", submit_times);
_report_aio_statistics("complete", reap_times);
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
<< " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;
#endif
#if DEBUG_DS_AIO_PERF
std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
<< " bytes " << std::endl;
#endif
}
void report_file_error(const char* filename, const std::string file_op, const int error_code)
{
std::string err_msg = file_op + std::string(" failed on ") + std::string(filename) +
" error = " + std::to_string(error_code);
std::cerr << c_library_name << ": " << err_msg << std::endl;
}
int open_file(const char* filename, const bool read_op)
{
const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT);
const int mode = 0600;
const auto fd = open(filename, flags, mode);
if (fd == -1) {
const auto error_code = errno;
const auto error_msg = read_op ? " open for read " : " open for write ";
report_file_error(filename, error_msg, error_code);
return -1;
}
return fd;
}
int regular_read(const char* filename, std::vector<char>& buffer)
{
long long int num_bytes;
const auto f_size = get_file_size(filename, num_bytes);
assert(f_size != -1);
buffer.resize(num_bytes);
const auto fd = open(filename, O_RDONLY, 0600);
assert(fd != -1);
long long int read_bytes = 0;
auto r = 0;
do {
const auto buffer_ptr = buffer.data() + read_bytes;
const auto bytes_to_read = num_bytes - read_bytes;
r = read(fd, buffer_ptr, bytes_to_read);
read_bytes += r;
} while (r > 0);
if (read_bytes != num_bytes) {
std::cerr << "read error "
<< " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
<< std::endl;
}
assert(read_bytes == num_bytes);
close(fd);
return 0;
}
static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes)
{
std::vector<char> regular_buffer;
const auto reg_ret = regular_read(filename, regular_buffer);
assert(0 == reg_ret);
std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes"
<< std::endl;
if (static_cast<long long int>(regular_buffer.size()) != num_bytes) { return false; }
return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size()));
}
bool validate_aio_operation(const bool read_op,
const char* filename,
void* aio_buffer,
const long long int num_bytes)
{
const auto msg_suffix = std::string("deepspeed_aio_") +
std::string(read_op ? "read()" : "write()") +
std::string("using read()");
if (false == _validate_buffer(filename, aio_buffer, num_bytes)) {
std::cout << "Fail: correctness of " << msg_suffix << std::endl;
return false;
}
std::cout << "Pass: correctness of " << msg_suffix << std::endl;
return true;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <deepspeed_aio_utils.h>
#include <stdlib.h>
#include <memory>
#include <string>
using namespace std;
void do_aio_operation_sequential(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf);
void do_aio_operation_overlap(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf);
int open_file(const char* filename, const bool read_op);
void report_file_error(const char* filename, const std::string file_op, const int error_code);
int regular_read(const char* filename, std::vector<char>& buffer);
bool validate_aio_operation(const bool read_op,
const char* filename,
void* aio_buffer,
const long long int num_bytes);
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using namespace std;
const int c_block_size = 128 * 1024;
const int c_io_queue_depth = 8;
deepspeed_aio_config_t::deepspeed_aio_config_t()
: _block_size(c_block_size),
_queue_depth(c_io_queue_depth),
_single_submit(false),
_overlap_events(false),
_lock_memory(false)
{
}
deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool lock_memory)
: _block_size(block_size),
_queue_depth(queue_depth),
_single_submit(single_submit),
_overlap_events(overlap_events),
_lock_memory(lock_memory)
{
}
void deepspeed_aio_latency_t::dump(const std::string tag)
{
std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl;
}
void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& other)
{
_min_usec += other._min_usec;
_max_usec += other._max_usec;
_avg_usec += other._avg_usec;
}
void deepspeed_aio_latency_t::scale(const float scaler)
{
_min_usec *= scaler;
_max_usec *= scaler;
_avg_usec *= scaler;
}
aio_context::aio_context(const int block_size, const int queue_depth)
{
_block_size = block_size;
_queue_depth = queue_depth;
for (auto i = 0; i < queue_depth; ++i) {
_iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb)));
}
_io_events.resize(queue_depth);
io_queue_init(queue_depth, &_io_ctxt);
}
aio_context::~aio_context()
{
for (auto& iocb : _iocbs) { free(iocb); }
_io_events.resize(0);
io_queue_release(_io_ctxt);
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <libaio.h>
#include <stdlib.h>
#include <string>
#include <vector>
using namespace std;
struct deepspeed_aio_latency_t {
double _min_usec;
double _max_usec;
double _avg_usec;
void dump(const std::string tag);
void accumulate(const deepspeed_aio_latency_t&);
void scale(const float value);
};
struct deepspeed_aio_perf_t {
deepspeed_aio_latency_t _submit;
deepspeed_aio_latency_t _complete;
double _e2e_usec;
double _e2e_rate_GB;
};
struct deepspeed_aio_config_t {
const int _block_size;
const int _queue_depth;
const bool _single_submit;
const bool _overlap_events;
const bool _lock_memory;
deepspeed_aio_config_t();
deepspeed_aio_config_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool lock_memory);
};
struct aio_context {
io_context_t _io_ctxt;
std::vector<struct io_event> _io_events;
std::vector<struct iocb*> _iocbs;
int _block_size;
int _queue_depth;
aio_context(const int block_size, const int queue_depth);
~aio_context();
};
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <cmath>
#include "deepspeed_aio_utils.h"
using namespace std;
const int c_block_size = 128 * 1024;
const int c_io_queue_depth = 8;
io_xfer_ctxt::io_xfer_ctxt(const int fd,
const long long int file_offset,
const long long int num_bytes,
const void* buffer)
: _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes)
{
}
io_prep_context::io_prep_context(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size,
const std::vector<struct iocb*>* iocbs)
: _read_op(read_op), _xfer_ctxt(xfer_ctxt), _block_size(block_size), _iocbs(iocbs)
{
}
void io_prep_context::prep_iocbs(const int n_iocbs,
const size_t num_bytes,
const void* start_buffer,
const long long int start_offset)
{
assert(static_cast<size_t>(n_iocbs) <= _iocbs->size());
for (auto i = 0; i < n_iocbs; ++i) {
const auto shift = i * _block_size;
const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift;
const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift;
auto byte_count = _block_size;
if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; }
if (_read_op) {
io_prep_pread(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
} else {
io_prep_pwrite(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset);
}
}
}
io_prep_generator::io_prep_generator(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size)
: _read_op(read_op),
_xfer_ctxt(xfer_ctxt),
_block_size(block_size),
_remaining_bytes(xfer_ctxt->_num_bytes),
_next_iocb_index(0)
{
_num_io_blocks =
static_cast<long long int>(ceil(static_cast<double>(xfer_ctxt->_num_bytes) / block_size));
_remaining_io_blocks = _num_io_blocks;
}
int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs)
{
if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) {
assert(static_cast<long long int>(_remaining_bytes) == _remaining_io_blocks);
return 0;
}
assert(static_cast<size_t>(n_iocbs) <= iocbs->size());
auto actual_n_iocbs = min(static_cast<long long int>(n_iocbs), _remaining_io_blocks);
for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) {
const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size);
const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset;
const auto num_bytes = min(static_cast<long long int>(_block_size), _remaining_bytes);
if (_read_op) {
io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
} else {
io_prep_pwrite(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset);
}
_remaining_bytes -= num_bytes;
}
_remaining_io_blocks -= actual_n_iocbs;
return actual_n_iocbs;
}
int get_file_size(const char* filename, long long int& size)
{
struct stat st;
if (stat(filename, &st) == -1) { return -1; }
size = st.st_size;
return 0;
}
void* ds_page_aligned_alloc(const size_t size, const bool lock)
{
void* ptr;
int retval;
retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size);
if (retval) { return nullptr; }
if (lock == false) { return ptr; }
auto mlock_ret = mlock(ptr, size);
if (mlock_ret != 0) {
auto mlock_error = errno;
printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error));
free(ptr);
return nullptr;
}
return ptr;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#pragma once
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <libaio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <deepspeed_aio_types.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
struct io_xfer_ctxt {
const int _fd;
const long long int _base_offset;
const void* _mem_buffer;
const long long int _num_bytes;
io_xfer_ctxt(const int fd,
const long long int file_offset,
const long long int num_bytes,
const void* buffer);
};
struct io_prep_context {
const bool _read_op;
const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;
const size_t _block_size;
const std::vector<struct iocb*>* _iocbs;
io_prep_context(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size,
const std::vector<struct iocb*>* iocbs);
void prep_iocbs(const int n_iocbs,
const size_t num_bytes,
const void* start_buffer,
const long long int start_offset);
};
struct io_prep_generator {
const bool _read_op;
const std::unique_ptr<io_xfer_ctxt>& _xfer_ctxt;
const size_t _block_size;
long long int _remaining_bytes;
long long int _num_io_blocks;
long long int _remaining_io_blocks;
long long int _next_iocb_index;
io_prep_generator(const bool read_op,
const std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
const size_t block_size);
int prep_iocbs(const int n_iocbs, std::vector<struct iocb*>* iocbs);
};
void* ds_page_aligned_alloc(const size_t size, const bool lock = false);
int get_file_size(const char* filename, long long int& size);
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_aio_thread.h"
using namespace std;
io_op_desc_t::io_op_desc_t(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const long long int num_bytes,
const bool validate)
: _read_op(read_op),
_buffer(buffer),
_fd(fd),
_filename(filename),
_num_bytes(num_bytes),
_validate(validate)
{
_cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer;
_contiguous_buffer = _cpu_buffer.contiguous();
}
char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
void io_op_desc_t::fini()
{
if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
}
deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
: _tid(tid),
_aio_config(aio_config),
_aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)),
_time_to_exit(false)
{
}
deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {}
void deepspeed_aio_thread_t::run()
{
while (true) {
std::shared_ptr<struct io_op_desc_t> next_io_op = nullptr;
{
std::unique_lock<std::mutex> lock(_work_sync._mutex);
_work_sync._cond_var.wait(lock,
[this] { return (!_work_queue.empty() || _time_to_exit); });
if (!_work_queue.empty()) {
next_io_op = _work_queue.front();
_work_queue.pop();
}
}
if (next_io_op) {
const auto base_offset = next_io_op->_num_bytes * _tid;
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(
next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr()));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(
next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(
next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
{
std::lock_guard<std::mutex> lock(_complete_sync._mutex);
_complete_queue.push(next_io_op);
}
_complete_sync._cond_var.notify_one();
}
if (_time_to_exit) { break; }
}
}
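One detail worth noting in `run()` above: each worker thread derives its file offset purely from its thread id, so for an op whose `_num_bytes` is N, thread t transfers the byte range below. This implies N is the per-thread share of the overall request, presumably carved out by the scheduling side (not shown in this excerpt):

```latex
\text{base\_offset}(t) = t \cdot N, \qquad \text{thread } t \text{ covers } [\,tN,\ (t+1)N\,)
```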
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <condition_variable>
#include <memory>
#include <queue>
#include "deepspeed_py_aio.h"
struct io_op_desc_t {
const bool _read_op;
torch::Tensor _buffer;
int _fd;
const std::string _filename;
const long long int _num_bytes;
torch::Tensor _cpu_buffer;
torch::Tensor _contiguous_buffer;
const bool _validate;
io_op_desc_t(const bool read_op,
const torch::Tensor& buffer,
const int fd,
const char* filename,
const long long int num_bytes,
const bool validate);
char* data_ptr() const;
void fini();
};
struct thread_sync_t {
std::mutex _mutex;
std::condition_variable _cond_var;
};
struct deepspeed_aio_thread_t {
const int _tid;
deepspeed_aio_config_t& _aio_config;
std::unique_ptr<struct aio_context> _aio_ctxt;
std::queue<std::shared_ptr<struct io_op_desc_t>> _work_queue;
std::queue<std::shared_ptr<struct io_op_desc_t>> _complete_queue;
bool _time_to_exit;
struct thread_sync_t _work_sync;
struct thread_sync_t _complete_sync;
deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config);
~deepspeed_aio_thread_t();
void run();
};
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cassert>
#include <chrono>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "deepspeed_py_aio.h"
using namespace std;
using namespace std::chrono;
#define DEBUG_DS_AIO_READ 0
#define DEBUG_DS_AIO_WRITE 0
static const std::string c_library_name = "deepspeed_aio";
int deepspeed_py_aio_write(const torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate)
{
const auto start_time = std::chrono::high_resolution_clock::now();
deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto write_buffer = (char*)buffer.data_ptr();
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));
if (config._overlap_events) {
do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr);
} else {
do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr);
}
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
close(fd);
if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
int deepspeed_py_aio_read(torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate)
{
const auto start_time = std::chrono::high_resolution_clock::now();
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false);
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto read_buffer = (char*)buffer.data_ptr();
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
std::unique_ptr<aio_context> aio_ctxt(new aio_context(config._block_size, config._queue_depth));
if (config._overlap_events) {
do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr);
} else {
do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr);
}
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
close(fd);
if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <deepspeed_aio_common.h>
#include <stdlib.h>
#include <torch/extension.h>
int deepspeed_py_aio_write(const torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate);
int deepspeed_py_aio_read(torch::Tensor& buffer,
const char* filename,
const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const bool validate);
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_aio_handle.h"
using namespace std;
static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const int num_threads)
: _aio_ctxt(new aio_context(block_size, queue_depth)),
_single_submit(single_submit),
_overlap_events(overlap_events),
_num_threads(num_threads),
_aio_config(block_size, queue_depth, single_submit, overlap_events, false),
_num_pending_ops(0)
{
for (auto i = 0; i < num_threads; ++i) {
_thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
}
for (auto& ctxt : _thread_contexts) {
_threads.push_back(std::thread(_start_aio_thread, ctxt));
}
}
deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
{
_stop_threads();
for (auto& thr : _threads) { thr.join(); }
}
const int deepspeed_aio_handle_t::get_block_size() const
{
return _aio_ctxt ? _aio_ctxt->_block_size : -1;
}
const int deepspeed_aio_handle_t::get_queue_depth() const
{
return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
}
const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; }
const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }
const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
{
const auto start_time = std::chrono::high_resolution_clock::now();
assert(_aio_ctxt);
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto read_buffer = (char*)buffer.data_ptr();
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
close(fd);
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
const char* filename,
const bool validate)
{
assert(_aio_ctxt);
const auto start_time = std::chrono::high_resolution_clock::now();
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto write_buffer = (char*)buffer.data_ptr();
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
if (_aio_config._overlap_events) {
do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
} else {
do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
}
const std::chrono::duration<double> aio_time =
std::chrono::high_resolution_clock::now() - start_time;
close(fd);
if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
std::cout << "Elapsed time(usec): "
<< "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
<< std::endl;
return 0;
}
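// Enqueue the same op descriptor on every worker thread's work queue and wake the
// workers; each thread then services its own portion of the transfer.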
void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
{
for (auto& ctxt : _thread_contexts) {
{
std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
ctxt->_work_queue.push(scheduled_op);
}
ctxt->_work_sync._cond_var.notify_one();
}
_num_pending_ops++;
}
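// Block until every worker thread has posted a completion for the scheduled op,
// then return the shared op descriptor.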
std::shared_ptr<struct io_op_desc_t> deepspeed_aio_handle_t::_wait_for_aio_work()
{
std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
for (auto& ctxt : _thread_contexts) {
std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
ctxt->_complete_sync._cond_var.wait(lock,
[ctxt] { return !ctxt->_complete_queue.empty(); });
completed_op = ctxt->_complete_queue.front();
ctxt->_complete_queue.pop();
}
return completed_op;
}
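// Signal each worker thread to exit its run loop; only legal once all pending ops
// have drained (asserted below). The destructor joins the threads afterwards.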
void deepspeed_aio_handle_t::_stop_threads()
{
assert(0 == _num_pending_ops);
for (auto& ctxt : _thread_contexts) {
{
std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
ctxt->_time_to_exit = true;
}
ctxt->_work_sync._cond_var.notify_one();
}
}
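// Drain all pending parallel ops: finalize each op (e.g. copying data back for reads),
// close its file descriptor, optionally validate, and return the number completed.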
int deepspeed_aio_handle_t::wait()
{
assert(_num_pending_ops > 0);
auto num_completed_ops = 0;
while (_num_pending_ops > 0) {
auto completed_op = _wait_for_aio_work();
completed_op->fini();
close(completed_op->_fd);
if (completed_op->_validate) {
validate_aio_operation(completed_op->_read_op,
completed_op->_filename.c_str(),
completed_op->data_ptr(),
_num_threads * completed_op->_num_bytes);
}
--_num_pending_ops;
++num_completed_ops;
}
return num_completed_ops;
}
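// A parallel op is valid only if num_bytes splits evenly across the worker threads.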
bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
const long long int num_bytes)
{
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_thread_count()) {
std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_thread_count() << std::endl;
return false;
}
return true;
}
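// Parallel read: shard the file evenly across the worker threads. With async=true the
// call returns immediately and the caller must later invoke wait(); otherwise it blocks.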
int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async)
{
long long num_file_bytes;
if (-1 == get_file_size(filename, num_file_bytes)) {
const auto error_code = errno;
report_file_error(filename, " fstat for read", error_code);
return -1;
}
const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
if (buffer_bytes != num_file_bytes) {
std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
<< " != " << num_file_bytes << std::endl;
}
assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
assert((num_file_bytes % _num_threads) == 0);
if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
const auto fd = open_file(filename, true);
if (fd == -1) { return -1; }
auto scheduled_op = std::make_shared<io_op_desc_t>(
true, buffer, fd, filename, (num_file_bytes / _num_threads), validate);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
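// Parallel write: mirror of pread(), sharding buffer.nbytes() across the worker threads.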
int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
const char* filename,
const bool validate,
const bool async)
{
const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
assert((num_write_bytes % _num_threads) == 0);
if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
const auto fd = open_file(filename, false);
if (fd == -1) { return -1; }
auto scheduled_op = std::make_shared<io_op_desc_t>(
false, buffer, fd, filename, (num_write_bytes / _num_threads), validate);
_schedule_aio_work(scheduled_op);
if (async) { return 0; }
return wait();
}
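// Convenience wrappers: synchronous and asynchronous variants of pread/pwrite with
// validation disabled.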
int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
{
return pread(buffer, filename, false, false);
}
int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
{
return pwrite(buffer, filename, false, false);
}
int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
{
return pread(buffer, filename, false, true);
}
int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
{
return pwrite(buffer, filename, false, true);
}
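/*
   Usage sketch (illustrative only, not part of the original file): "tensor" and the
   file path below are placeholders, and tensor.nbytes() must be divisible by the
   thread count (see _is_valid_parallel_aio_op above).

       deepspeed_aio_handle_t handle(4096, 32, false, true, 4);
       if (handle.sync_pwrite(tensor, "/local_nvme/swap.tensor") != -1) {
           handle.sync_pread(tensor, "/local_nvme/swap.tensor");
       }
*/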