Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
'''Copyright The Microsoft DeepSpeed Team'''
from . import adam
from . import adagrad
from . import lamb
......
'''Copyright The Microsoft DeepSpeed Team'''
from .cpu_adagrad import DeepSpeedCPUAdagrad
......@@ -2,11 +2,8 @@
Copyright 2020 The Microsoft DeepSpeed Team
'''
import math
import torch
import time
from pathlib import Path
from ..op_builder import CPUAdagradBuilder
from deepspeed.ops.op_builder import CPUAdagradBuilder
from deepspeed.utils.logging import should_log_le
......
'''Copyright The Microsoft DeepSpeed Team'''
from .cpu_adam import DeepSpeedCPUAdam
from .fused_adam import FusedAdam
......@@ -2,12 +2,11 @@
Copyright 2020 The Microsoft DeepSpeed Team
'''
import math
import torch
import time
from pathlib import Path
from ..op_builder import CPUAdamBuilder
from cpuinfo import get_cpu_info
from deepspeed.utils import logger
from deepspeed.utils.logging import should_log_le
from deepspeed.ops.op_builder import CPUAdamBuilder
class DeepSpeedCPUAdam(torch.optim.Optimizer):
......@@ -76,6 +75,20 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
amsgrad=amsgrad)
super(DeepSpeedCPUAdam, self).__init__(model_params, default_args)
cpu_info = get_cpu_info()
self.cpu_vendor = cpu_info["vendor_id_raw"].lower(
) if "vendor_id_raw" in cpu_info else "unknown"
if "amd" in self.cpu_vendor:
for group_id, group in enumerate(self.param_groups):
for param_id, p in enumerate(group['params']):
if p.dtype == torch.half:
logger.warning(
"FP16 params for CPUAdam may not work on AMD CPUs")
break
else:
continue
break
self.opt_id = DeepSpeedCPUAdam.optimizer_id
DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
self.adam_w_mode = adamw_mode
......
......@@ -6,11 +6,11 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85
'''
import torch
import importlib
from .multi_tensor_apply import MultiTensorApply
multi_tensor_applier = MultiTensorApply(2048 * 32)
from ..op_builder import FusedAdamBuilder
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import FusedAdamBuilder
class FusedAdam(torch.optim.Optimizer):
......@@ -72,7 +72,7 @@ class FusedAdam(torch.optim.Optimizer):
fused_adam_cuda = FusedAdamBuilder().load()
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self._dummy_overflow_buf = get_accelerator().IntTensor([0])
self.multi_tensor_adam = fused_adam_cuda.multi_tensor_adam
def zero_grad(self):
......@@ -109,12 +109,8 @@ class FusedAdam(torch.optim.Optimizer):
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# assume same step across group now to simplify things
# per parameter step can be easily support by making it tensor, or pass list into kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
if 'step' not in group:
group['step'] = 0
# create lists for multi-tensor apply
g_16, p_16, m_16, v_16 = [], [], [], []
......@@ -131,6 +127,10 @@ class FusedAdam(torch.optim.Optimizer):
state = self.state[p]
# State initialization
if len(state) == 0:
# DeepSpeed ZeRO 3 processes each subgroup one at a time, so we need to keep tracking the step count for each tensor separately.
# This is not an issue for ZeRO 1 & 2, since they apply a single optimization step to the whole param group at the same time.
# In order to keep backward compatibility with existing checkpoints, we use group['step'] to initialize state['step'] if it exists.
state['step'] = group.get('step', 0)
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
......@@ -150,6 +150,7 @@ class FusedAdam(torch.optim.Optimizer):
raise RuntimeError('FusedAdam only support fp16 and fp32.')
if (len(g_16) > 0):
state['step'] += 1
multi_tensor_applier(self.multi_tensor_adam,
self._dummy_overflow_buf,
[g_16,
......@@ -160,11 +161,12 @@ class FusedAdam(torch.optim.Optimizer):
beta1,
beta2,
group['eps'],
group['step'],
state['step'],
self.adam_w_mode,
bias_correction,
group['weight_decay'])
if (len(g_32) > 0):
state['step'] += 1
multi_tensor_applier(self.multi_tensor_adam,
self._dummy_overflow_buf,
[g_32,
......@@ -175,7 +177,7 @@ class FusedAdam(torch.optim.Optimizer):
beta1,
beta2,
group['eps'],
group['step'],
state['step'],
self.adam_w_mode,
bias_correction,
group['weight_decay'])
......
......@@ -4,7 +4,6 @@ Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from NVIDIA/apex, commit a109f85
'''
import torch
class MultiTensorApply(object):
......
File mode changed from 100644 to 100755
../../csrc
\ No newline at end of file
#include "cpu_adagrad.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
// Registry of optimizer instances created through create_adagrad_optimizer, keyed by the
// caller-supplied integer id. Entries are stored type-erased (shared_ptr<void>) and are
// cast back to Adagrad_Optimizer at each lookup in this file.
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
// Scalar Adagrad update for the elements that Step_AVX<1> did not handle.
//
// _params / grads   : parameter and gradient buffers. When `half_precision` is true they
//                     are reinterpreted as __half arrays and indexed by element.
// _exp_avg_sq       : running sum of squared gradients, always read/written as float.
// _param_size       : number of elements to process.
// dev_params        : optional destination for the updated parameters; when non-null each
//                     tile is staged in _doubled_buffer[_buf_index] and handed to
//                     launch_param_update on _streams[_buf_index].
//
// Side effect: the gradient buffer is overwritten with the applied update term.
void Adagrad_Optimizer::Step_1(float* _params,
                               float* grads,
                               float* _exp_avg_sq,
                               size_t _param_size,
                               __half* dev_params,
                               bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(
        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size) {
        float step_size = -1 * _alpha;
        __half* grads_cast_h = nullptr;
        __half* params_cast_h = nullptr;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }
        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            // From the third tile on, wait for the stream tied to the staging buffer that
            // is about to be overwritten.
            if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                // Numerator of the update is the raw gradient (before weight decay is
                // folded in). Reuse the value converted above: re-reading grads[k] as
                // float would misinterpret (and overrun) a __half gradient buffer.
                float momentum = grad;
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
                variance += grad * grad;
                grad = sqrt(variance);
                grad += _eps;
                grad = momentum / grad;
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
                // Store the new parameter and the update term using the element type the
                // buffers really hold.
                if (half_precision) {
                    params_cast_h[k] = (__half)param;
                    grads_cast_h[k] = (__half)(grad * step_size);
                } else {
                    _params[k] = param;
                    grads[k] = grad * step_size;
                }
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
                // Alternate between the two staging buffers/streams.
                _buf_index = !_buf_index;
            }
        }
    }
}
void Adagrad_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adagrad_optimizer(int optimizer_id,
float alpha = 1e-2,
float eps = 1e-8,
float weight_decay = 0,
bool should_log = false)
{
auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
}
return 0;
}
void Adagrad_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adagrad_step(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
opt->SynchronizeStreams();
return 0;
}
int ds_adagrad_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
// Remove the registry entry for `optimizer_id`, dropping the registry's reference to the
// optimizer. Ids that were never registered are ignored. Always returns 0.
int destroy_adagrad_optimizer(int optimizer_id)
{
    const auto entry = s_optimizers.find(optimizer_id);
    if (entry != s_optimizers.end()) { s_optimizers.erase(entry); }
    return 0;
}
// Python bindings for the CPU Adagrad extension. Exposed names:
//   adagrad_update      -> ds_adagrad_step
//   adagrad_update_copy -> ds_adagrad_step_plus_copy
//   create_adagrad      -> create_adagrad_optimizer
//   destroy_adagrad     -> destroy_adagrad_optimizer
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
m.def("adagrad_update_copy",
&ds_adagrad_step_plus_copy,
"DeepSpeed CPU Adagrad update and param copy (C++)");
m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adagrad_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
// Registry of optimizer instances created through create_adagrad_optimizer, keyed by the
// caller-supplied integer id. Entries are stored type-erased (shared_ptr<void>) and are
// cast back to Adagrad_Optimizer at each lookup in this file.
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
// Scalar Adagrad update for the elements that Step_AVX<1> did not handle (HIP build).
//
// _params / grads   : parameter and gradient buffers. When `half_precision` is true they
//                     are reinterpreted as __half arrays and indexed by element.
// _exp_avg_sq       : running sum of squared gradients, always read/written as float.
// _param_size       : number of elements to process.
// dev_params        : optional destination for the updated parameters; when non-null each
//                     tile is staged in _doubled_buffer[_buf_index] and handed to
//                     launch_param_update on _streams[_buf_index].
//
// Side effect: the gradient buffer is overwritten with the applied update term.
void Adagrad_Optimizer::Step_1(float* _params,
                               float* grads,
                               float* _exp_avg_sq,
                               size_t _param_size,
                               __half* dev_params,
                               bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(
        &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size) {
        float step_size = -1 * _alpha;
        __half* grads_cast_h = nullptr;
        __half* params_cast_h = nullptr;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }
        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            // From the third tile on, wait for the stream tied to the staging buffer that
            // is about to be overwritten.
            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                // Numerator of the update is the raw gradient (before weight decay is
                // folded in). Reuse the value converted above: re-reading grads[k] as
                // float would misinterpret (and overrun) a __half gradient buffer.
                float momentum = grad;
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0) { grad = param * _weight_decay + grad; }
                variance += grad * grad;
                grad = sqrt(variance);
                grad += _eps;
                grad = momentum / grad;
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
                // Store the new parameter and the update term using the element type the
                // buffers really hold.
                if (half_precision) {
                    params_cast_h[k] = (__half)param;
                    grads_cast_h[k] = (__half)(grad * step_size);
                } else {
                    _params[k] = param;
                    grads[k] = grad * step_size;
                }
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
                // Alternate between the two staging buffers/streams.
                _buf_index = !_buf_index;
            }
        }
    }
}
void Adagrad_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adagrad_optimizer(int optimizer_id,
float alpha = 1e-2,
float eps = 1e-8,
float weight_decay = 0,
bool should_log = false)
{
auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
}
return 0;
}
void Adagrad_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(
&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adagrad_step(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
opt->SynchronizeStreams();
return 0;
}
int ds_adagrad_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float epsilon,
float weight_decay,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt =
std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step);
opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
// Remove the registry entry for `optimizer_id`, dropping the registry's reference to the
// optimizer. Ids that were never registered are ignored. Always returns 0.
int destroy_adagrad_optimizer(int optimizer_id)
{
    const auto entry = s_optimizers.find(optimizer_id);
    if (entry != s_optimizers.end()) { s_optimizers.erase(entry); }
    return 0;
}
// Python bindings for the CPU Adagrad extension (HIP build). Exposed names:
//   adagrad_update      -> ds_adagrad_step
//   adagrad_update_copy -> ds_adagrad_step_plus_copy
//   create_adagrad      -> create_adagrad_optimizer
//   destroy_adagrad     -> destroy_adagrad_optimizer
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
m.def("adagrad_update_copy",
&ds_adagrad_step_plus_copy,
"DeepSpeed CPU Adagrad update and param copy (C++)");
m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
#include "cpu_adam.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
// Registry of optimizer instances created through create_adam_optimizer, keyed by the
// caller-supplied integer id. Entries are stored type-erased (shared_ptr<void>) and are
// cast back to Adam_Optimizer at each lookup in this file.
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
// Scalar Adam / AdamW update for the elements that Step_AVX<1> did not handle.
//
// _params / grads        : parameter and gradient buffers; reinterpreted as __half arrays
//                          when `half_precision` is true.
// _exp_avg / _exp_avg_sq : first and second moment buffers, always accessed as float.
// _param_size            : number of elements to process.
// dev_params             : optional destination for the updated parameters; when non-null
//                          each tile is staged in _doubled_buffer[_buf_index] and handed
//                          to launch_param_update on _streams[_buf_index].
void Adam_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
// On AVX builds the vector kernel runs first and reports via rounded_size how many
// leading elements it handled; otherwise rounded_size stays 0.
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size) {
// Loop-invariant factors. Note the names: these hold (1 - beta), not (beta - 1).
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
// Negative so that adding step_size * update moves against the gradient.
float step_size = -1 * _alpha / _bias_correction1;
// Per-step multiplicative decay used in AdamW mode.
float w_decay = -1 * _alpha * _weight_decay;
// Only assigned (and only read) when half_precision is true.
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
// Walk the remaining elements in TILE-sized chunks.
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
// From the third tile on, wait for the stream tied to the staging buffer that is
// about to be overwritten.
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = _exp_avg[k];
float variance = _exp_avg_sq[k];
// Classic Adam (non-AdamW): weight decay is folded into the gradient (L2 term).
if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
// momentum = beta1 * momentum + (1 - beta1) * grad
momentum = momentum * _betta1;
momentum = grad * betta1_minus1 + momentum;
// variance = beta2 * variance + (1 - beta2) * grad^2
variance = variance * _betta2;
grad = grad * grad;
variance = grad * betta2_minus1 + variance;
// From here on `grad` is reused as scratch for the denominator, then the update.
grad = sqrt(variance);
grad = grad * _bias_correction2 + _eps;
grad = momentum / grad;
// AdamW: decay is applied directly to the parameter, decoupled from the gradient.
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param;
// Stage the float result for the device copy; index is relative to the tile start.
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
_exp_avg[k] = momentum;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
// Alternate between the two staging buffers/streams.
_buf_index = !_buf_index;
}
}
}
}
void Adam_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adam_optimizer(int optimizer_id,
float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true,
bool should_log = false)
{
auto opt =
std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
alpha,
betta1,
betta2,
weight_decay,
(int)adamw_mode);
}
return 0;
}
void Adam_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adam_step(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
// assert(params.options().dtype() == grads.options().dtype());
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
nullptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int ds_adam_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
// Remove the registry entry for `optimizer_id`, dropping the registry's reference to the
// optimizer. Ids that were never registered are ignored. Always returns 0.
int destroy_adam_optimizer(int optimizer_id)
{
    const auto entry = s_optimizers.find(optimizer_id);
    if (entry != s_optimizers.end()) { s_optimizers.erase(entry); }
    return 0;
}
// Python bindings for the CPU Adam extension. Exposed names:
//   adam_update      -> ds_adam_step
//   adam_update_copy -> ds_adam_step_plus_copy
//   create_adam      -> create_adam_optimizer
//   destroy_adam     -> destroy_adam_optimizer
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
m.def("adam_update_copy",
&ds_adam_step_plus_copy,
"DeepSpeed CPU Adam update and param copy (C++)");
m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adam_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
// Registry of optimizer instances created through create_adam_optimizer, keyed by the
// caller-supplied integer id. Entries are stored type-erased (shared_ptr<void>) and are
// cast back to Adam_Optimizer at each lookup in this file.
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
// C++ interface
// Scalar Adam / AdamW update for the elements that Step_AVX<1> did not handle (HIP build).
//
// _params / grads        : parameter and gradient buffers; reinterpreted as __half arrays
//                          when `half_precision` is true.
// _exp_avg / _exp_avg_sq : first and second moment buffers, always accessed as float.
// _param_size            : number of elements to process.
// dev_params             : optional destination for the updated parameters; when non-null
//                          each tile is staged in _doubled_buffer[_buf_index] and handed
//                          to launch_param_update on _streams[_buf_index].
void Adam_Optimizer::Step_1(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
// On AVX builds the vector kernel runs first and reports via rounded_size how many
// leading elements it handled; otherwise rounded_size stays 0.
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<1>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size) {
// Loop-invariant factors. Note the names: these hold (1 - beta), not (beta - 1).
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
// Negative so that adding step_size * update moves against the gradient.
float step_size = -1 * _alpha / _bias_correction1;
// Per-step multiplicative decay used in AdamW mode.
float w_decay = -1 * _alpha * _weight_decay;
// Only assigned (and only read) when half_precision is true.
__half* grads_cast_h;
__half* params_cast_h;
if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params);
}
// Walk the remaining elements in TILE-sized chunks.
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
// From the third tile on, wait for the stream tied to the staging buffer that is
// about to be overwritten.
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
float param = half_precision ? (float)params_cast_h[k] : _params[k];
float momentum = _exp_avg[k];
float variance = _exp_avg_sq[k];
// Classic Adam (non-AdamW): weight decay is folded into the gradient (L2 term).
if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
// momentum = beta1 * momentum + (1 - beta1) * grad
momentum = momentum * _betta1;
momentum = grad * betta1_minus1 + momentum;
// variance = beta2 * variance + (1 - beta2) * grad^2
variance = variance * _betta2;
grad = grad * grad;
variance = grad * betta2_minus1 + variance;
// From here on `grad` is reused as scratch for the denominator, then the update.
grad = sqrt(variance);
grad = grad * _bias_correction2 + _eps;
grad = momentum / grad;
// AdamW: decay is applied directly to the parameter, decoupled from the gradient.
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param;
// Stage the float result for the device copy; index is relative to the tile start.
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
if (half_precision)
params_cast_h[k] = (__half)param;
else
_params[k] = param;
_exp_avg[k] = momentum;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
// Alternate between the two staging buffers/streams.
_buf_index = !_buf_index;
}
}
}
}
void Adam_Optimizer::Step_4(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<4>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_1((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int create_adam_optimizer(int optimizer_id,
float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true,
bool should_log = false)
{
auto opt =
std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);
s_optimizers[optimizer_id] = opt;
if (should_log) {
std::string avx_type = "";
#if defined(__AVX512__)
avx_type = "AVX512";
#else
#if defined(__AVX256__)
avx_type = "AVX2";
#else
avx_type = "scalar";
#endif
#endif
printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
optimizer_id,
avx_type.c_str());
printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
alpha,
betta1,
betta2,
weight_decay,
(int)adamw_mode);
}
return 0;
}
void Adam_Optimizer::Step_8(float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
Step_AVX<8>(&rounded_size,
_params,
grads,
_exp_avg,
_exp_avg_sq,
_param_size,
dev_params,
half_precision);
#endif
if (_param_size > rounded_size)
Step_4((_params + rounded_size),
(grads + rounded_size),
(_exp_avg + rounded_size),
(_exp_avg_sq + rounded_size),
(_param_size - rounded_size),
(dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
half_precision);
}
int ds_adam_step(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq)
{
auto params_c = params.contiguous();
auto grads_c = grads.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
// assert(params.options().dtype() == grads.options().dtype());
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
nullptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
int ds_adam_step_plus_copy(int optimizer_id,
size_t step,
float lr,
float beta1,
float beta2,
float epsilon,
float weight_decay,
bool bias_correction,
torch::Tensor& params,
torch::Tensor& grads,
torch::Tensor& exp_avg,
torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params)
{
auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_c = exp_avg.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous();
auto grads_c = grads.contiguous();
float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adam_Optimizer> opt =
std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
opt->IncrementStep(step, beta1, beta2);
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr,
grads_ptr,
exp_avg_ptr,
exp_avg_sq_ptr,
params_c.size(0),
gpu_params_ptr,
(params.options().dtype() == at::kHalf));
opt->SynchronizeStreams();
return 0;
}
// Remove the registry entry for `optimizer_id`, dropping the registry's reference to the
// optimizer. Ids that were never registered are ignored. Always returns 0.
int destroy_adam_optimizer(int optimizer_id)
{
    const auto entry = s_optimizers.find(optimizer_id);
    if (entry != s_optimizers.end()) { s_optimizers.erase(entry); }
    return 0;
}
// Python bindings for the CPU Adam extension (HIP build). Exposed names:
//   adam_update      -> ds_adam_step
//   adam_update_copy -> ds_adam_step_plus_copy
//   create_adam      -> create_adam_optimizer
//   destroy_adam     -> destroy_adam_optimizer
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
m.def("adam_update_copy",
&ds_adam_step_plus_copy,
"DeepSpeed CPU Adam update and param copy (C++)");
m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}
#include <torch/extension.h>
// Forward declaration of the fused multi-tensor Adam launcher; the definition is not in
// this translation unit. It is exported to Python below as "multi_tensor_adam".
// NOTE(review): tensor_lists is presumably [grads, params, exp_avg, exp_avg_sq] and
// `mode` presumably selects L2 vs. decoupled (AdamW) weight decay - confirm against the
// kernel source.
void multi_tensor_adam_cuda(int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay);
// Python bindings for the fused Adam extension: exposes multi_tensor_adam_cuda under the
// Python name "multi_tensor_adam".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("multi_tensor_adam",
&multi_tensor_adam_cuda,
"Compute and apply gradient update to parameters for Adam optimizer");
}
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "multi_tensor_apply.cuh"
#include "type_shim.h"
// NOTE(review): BLOCK_SIZE is not referenced in the visible part of this file; presumably
// the thread-block size used at kernel launch - confirm.
#define BLOCK_SIZE 512
// Number of elements each thread handles per outer-loop iteration in AdamFunctor; it sizes
// the per-thread r_g/r_p/r_m/r_v arrays and scales the loop stride (blockDim.x * ILP).
#define ILP 4
// Selects how weight decay enters the update in AdamFunctor.
typedef enum {
ADAM_MODE_0 = 0, // L2 regularization mode
ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW)
} adamMode_t;
// Arithmetic type of the per-thread working values in AdamFunctor, independent of the
// tensor element type T.
using MATH_T = float;
// Adam / AdamW update for one chunk of one tensor.
//
// Invoked by multi_tensor_apply_kernel with one thread block per (tensor, chunk) pair. The
// four address lists in `tl` are, in order: gradient, parameter, first moment (m) and second
// moment (v). Values are loaded into MATH_T registers, updated, and the parameter and both
// moments are written back; the gradient is only read.
//
//   mode == ADAM_MODE_0  weight decay is folded into the gradient (L2 regularization)
//   otherwise            weight decay is added to the final update (AdamW)
template <typename T>
struct AdamFunctor {
    __device__ __forceinline__ void operator()(int chunk_size,
                                               volatile int* noop_gmem,
                                               TensorListMetadata<4>& tl,
                                               const float beta1,
                                               const float beta2,
                                               const float beta1_correction,
                                               const float beta2_correction,
                                               const float epsilon,
                                               const float lr,
                                               adamMode_t mode,
                                               const float decay)
    {
        // noop_gmem is deliberately not consulted: this kernel is meant to propagate
        // infs/nans rather than skip the update.
        const int tensor_loc = tl.block_to_tensor[blockIdx.x];
        const int chunk_idx = tl.block_to_chunk[blockIdx.x];
        const int chunk_offset = chunk_idx * chunk_size;

        // Base pointers of this block's chunk within each of the four tensors.
        T* const grad = (T*)tl.addresses[0][tensor_loc] + chunk_offset;
        T* const param = (T*)tl.addresses[1][tensor_loc] + chunk_offset;
        T* const exp_avg = (T*)tl.addresses[2][tensor_loc] + chunk_offset;
        T* const exp_avg_sq = (T*)tl.addresses[3][tensor_loc] + chunk_offset;

        // Elements left in the tensor from this chunk onwards, capped at one chunk.
        const int remaining = tl.sizes[tensor_loc] - chunk_offset;
        const int limit = (remaining < chunk_size) ? remaining : chunk_size;

        const bool l2_mode = (mode == ADAM_MODE_0);

        for (int base = 0; base < limit; base += blockDim.x * ILP) {
            MATH_T reg_g[ILP];
            MATH_T reg_p[ILP];
            MATH_T reg_m[ILP];
            MATH_T reg_v[ILP];

            // Load phase: out-of-range lanes are zero-filled so the math below is harmless.
#pragma unroll
            for (int lane = 0; lane < ILP; lane++) {
                const int idx = base + threadIdx.x + lane * blockDim.x;
                if (idx < limit) {
                    reg_g[lane] = grad[idx];
                    reg_p[lane] = param[idx];
                    reg_m[lane] = exp_avg[idx];
                    reg_v[lane] = exp_avg_sq[idx];
                } else {
                    reg_g[lane] = MATH_T(0);
                    reg_p[lane] = MATH_T(0);
                    reg_m[lane] = MATH_T(0);
                    reg_v[lane] = MATH_T(0);
                }
            }

            // Compute phase.
#pragma unroll
            for (int lane = 0; lane < ILP; lane++) {
                // L2 mode folds the decay term into the gradient before the moment updates.
                if (l2_mode) { reg_g[lane] = reg_g[lane] + (decay * reg_p[lane]); }

                reg_m[lane] = beta1 * reg_m[lane] + (1 - beta1) * reg_g[lane];
                reg_v[lane] = beta2 * reg_v[lane] + (1 - beta2) * reg_g[lane] * reg_g[lane];

                MATH_T next_m_unbiased = reg_m[lane] / beta1_correction;
                MATH_T next_v_unbiased = reg_v[lane] / beta2_correction;
                MATH_T denom = sqrtf(next_v_unbiased) + epsilon;

                MATH_T update = next_m_unbiased / denom;
                // AdamW mode adds the decoupled decay term to the step instead.
                if (!l2_mode) { update = update + (decay * reg_p[lane]); }

                reg_p[lane] = reg_p[lane] - (lr * update);
            }

            // Store phase: only in-range lanes write back.
#pragma unroll
            for (int lane = 0; lane < ILP; lane++) {
                const int idx = base + threadIdx.x + lane * blockDim.x;
                if (idx < limit) {
                    param[idx] = reg_p[lane];
                    exp_avg[idx] = reg_m[lane];
                    exp_avg_sq[idx] = reg_v[lane];
                }
            }
        }
    }
};
void multi_tensor_adam_cuda(int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - std::pow(beta1, step);
bias_correction2 = 1 - std::pow(beta2, step);
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
0,
"adam",
multi_tensor_apply<4>(BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
bias_correction1,
bias_correction2,
epsilon,
lr,
(adamMode_t)mode,
weight_decay);)
AT_CUDA_CHECK(cudaGetLastError());
}
// !!! This is a file automatically generated by hipify!!!
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "multi_tensor_apply_hip.cuh"
#include "type_shim_hip.h"
// Thread-block size passed to multi_tensor_apply when launching the Adam kernel.
#define BLOCK_SIZE 512
// Number of elements each thread loads, updates and stores per pass of the functor's
// unrolled inner loops.
#define ILP 4
// Selects how AdamFunctor applies weight decay.
typedef enum {
ADAM_MODE_0 = 0,  // L2 regularization mode
ADAM_MODE_1 = 1   // Decoupled weight decay mode(AdamW)
} adamMode_t;
// All in-register arithmetic in AdamFunctor is done in this type, whatever the tensor
// element type T is.
using MATH_T = float;
// Adam / AdamW update for one chunk of one tensor.
//
// Invoked by multi_tensor_apply_kernel with one thread block per (tensor, chunk) pair. The
// four address lists in `tl` are, in order: gradient, parameter, first moment (m) and second
// moment (v). Values are loaded into MATH_T registers, updated, and the parameter and both
// moments are written back; the gradient is only read.
//
//   mode == ADAM_MODE_0  weight decay is folded into the gradient (L2 regularization)
//   otherwise            weight decay is added to the final update (AdamW)
template <typename T>
struct AdamFunctor {
    __device__ __forceinline__ void operator()(int chunk_size,
                                               volatile int* noop_gmem,
                                               TensorListMetadata<4>& tl,
                                               const float beta1,
                                               const float beta2,
                                               const float beta1_correction,
                                               const float beta2_correction,
                                               const float epsilon,
                                               const float lr,
                                               adamMode_t mode,
                                               const float decay)
    {
        // noop_gmem is deliberately not consulted: this kernel is meant to propagate
        // infs/nans rather than skip the update.
        const int tensor_loc = tl.block_to_tensor[blockIdx.x];
        const int chunk_idx = tl.block_to_chunk[blockIdx.x];
        const int chunk_offset = chunk_idx * chunk_size;

        // Base pointers of this block's chunk within each of the four tensors.
        T* const grad = (T*)tl.addresses[0][tensor_loc] + chunk_offset;
        T* const param = (T*)tl.addresses[1][tensor_loc] + chunk_offset;
        T* const exp_avg = (T*)tl.addresses[2][tensor_loc] + chunk_offset;
        T* const exp_avg_sq = (T*)tl.addresses[3][tensor_loc] + chunk_offset;

        // Elements left in the tensor from this chunk onwards, capped at one chunk.
        const int remaining = tl.sizes[tensor_loc] - chunk_offset;
        const int limit = (remaining < chunk_size) ? remaining : chunk_size;

        const bool l2_mode = (mode == ADAM_MODE_0);

        for (int base = 0; base < limit; base += blockDim.x * ILP) {
            MATH_T reg_g[ILP];
            MATH_T reg_p[ILP];
            MATH_T reg_m[ILP];
            MATH_T reg_v[ILP];

            // Load phase: out-of-range lanes are zero-filled so the math below is harmless.
#pragma unroll
            for (int lane = 0; lane < ILP; lane++) {
                const int idx = base + threadIdx.x + lane * blockDim.x;
                if (idx < limit) {
                    reg_g[lane] = grad[idx];
                    reg_p[lane] = param[idx];
                    reg_m[lane] = exp_avg[idx];
                    reg_v[lane] = exp_avg_sq[idx];
                } else {
                    reg_g[lane] = MATH_T(0);
                    reg_p[lane] = MATH_T(0);
                    reg_m[lane] = MATH_T(0);
                    reg_v[lane] = MATH_T(0);
                }
            }

            // Compute phase.
#pragma unroll
            for (int lane = 0; lane < ILP; lane++) {
                // L2 mode folds the decay term into the gradient before the moment updates.
                if (l2_mode) { reg_g[lane] = reg_g[lane] + (decay * reg_p[lane]); }

                reg_m[lane] = beta1 * reg_m[lane] + (1 - beta1) * reg_g[lane];
                reg_v[lane] = beta2 * reg_v[lane] + (1 - beta2) * reg_g[lane] * reg_g[lane];

                MATH_T next_m_unbiased = reg_m[lane] / beta1_correction;
                MATH_T next_v_unbiased = reg_v[lane] / beta2_correction;
                MATH_T denom = sqrtf(next_v_unbiased) + epsilon;

                MATH_T update = next_m_unbiased / denom;
                // AdamW mode adds the decoupled decay term to the step instead.
                if (!l2_mode) { update = update + (decay * reg_p[lane]); }

                reg_p[lane] = reg_p[lane] - (lr * update);
            }

            // Store phase: only in-range lanes write back.
#pragma unroll
            for (int lane = 0; lane < ILP; lane++) {
                const int idx = base + threadIdx.x + lane * blockDim.x;
                if (idx < limit) {
                    param[idx] = reg_p[lane];
                    exp_avg[idx] = reg_m[lane];
                    exp_avg_sq[idx] = reg_v[lane];
                }
            }
        }
    }
};
void multi_tensor_adam_cuda(int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - ::pow(beta1, step);
bias_correction2 = 1 - ::pow(beta2, step);
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
0,
"adam",
multi_tensor_apply<4>(BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
bias_correction1,
bias_correction2,
epsilon,
lr,
(adamMode_t)mode,
weight_decay);)
AT_CUDA_CHECK(hipGetLastError());
}
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include "compat.h"
#include <assert.h>
// #include <iostream>
// This header is the one-stop shop for all your multi-tensor apply needs.
// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
// Per-launch capacity limits, indexed by (depth - 1), where depth is the number of parallel
// tensor lists. They size the arrays in TensorListMetadata below.
// NOTE(review): the values appear chosen so the struct fits the kernel-argument size limit
// mentioned in the TODO above - confirm before changing them.
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
// Metadata describing one kernel launch; multi_tensor_apply fills it on the host and passes
// it to multi_tensor_apply_kernel by value. `n` is the number of parallel tensor lists.
template <int n>
struct TensorListMetadata {
// addresses[d][i]: data pointer of the i-th tensor registered for this launch, from list d.
void* addresses[n][depth_to_max_tensors[n - 1]];
// sizes[i]: element count of the i-th registered tensor (taken from list 0).
int sizes[depth_to_max_tensors[n - 1]];
// block_to_tensor[b]: index into addresses/sizes of the tensor that block b works on.
unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
// block_to_chunk[b]: which chunk of that tensor block b works on.
int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a full int.
// Index in the caller's tensor list of the first tensor covered by this launch.
int start_tensor_this_launch;
};
// Generic kernel entry point: every thread simply invokes the user-supplied functor with the
// chunk size, the noop flag, the launch metadata `tl` (received by value) and the extra
// arguments. The functor itself uses blockIdx/threadIdx to locate its work inside `tl`.
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int chunk_size,
volatile int* noop_flag,
T tl,
U callable,
ArgTypes... args)
{
// Hand the chunk information to the user-supplied functor to process however it likes.
callable(chunk_size, noop_flag, tl, args...);
}
// Applies `callable` to every chunk of every tensor in `tensor_lists`, batching as many
// (tensor, chunk) pairs as fit into one TensorListMetadata per kernel launch.
//
//   depth         number of parallel tensor lists (must equal tensor_lists.size())
//   block_size    threads per block for each launch
//   chunk_size    elements handled by one thread block; must be positive
//   noop_flag     int tensor whose data pointer is forwarded to the functor
//   tensor_lists  `depth` lists of equal length; tensors at the same position must have the
//                 same number of elements, be contiguous, and live on the same CUDA device
//   callable      device functor invoked by multi_tensor_apply_kernel
//   args          extra arguments forwarded to the functor
//
// Zero-element tensors are skipped: they produce no chunks, so registering them would
// consume metadata slots without ever reaching the flush logic (and, if the last tensor were
// empty, would leave previously queued blocks unlaunched).
//
// Raises (via TORCH_CHECK) on any violated precondition and (via AT_CUDA_CHECK) on a launch
// error.
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int block_size,
                        int chunk_size,
                        const at::Tensor& noop_flag,
                        const std::vector<std::vector<at::Tensor>>& tensor_lists,
                        T callable,
                        ArgTypes... args)
{
    TORCH_CHECK(chunk_size > 0, "chunk_size must be positive");
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++) {
            // TODO: Print which tensor fails.
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory = (contiguous_memory ||
                                 tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
                        "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }

    int ntensors = tensor_lists[0].size();

    TensorListMetadata<depth> tl;

    const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::cuda::getCurrentCUDAStream();

    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;   // blocks queued for the next launch
    int loc_tensor_info = 0;  // tensors registered for the next launch
    for (int t = 0; t < ntensors; t++) {
        // Empty tensors contribute no chunks; skip them so they neither occupy a metadata
        // slot nor bypass the capacity checks performed inside the chunk loop.
        if (tensor_lists[0][t].numel() == 0) { continue; }

        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;

        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

        for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;

            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                                 chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk) {
                multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
                    chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);

                AT_CUDA_CHECK(cudaGetLastError());

                // Reset. The control flow possibilities here make my brain hurt.
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1) {
                    // The current tensor is finished: start the next launch from scratch.
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                } else {
                    // The current tensor still has chunks left: carry it over as slot 0.
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }

    // When the list ends with empty tensors, `last_chunk` never fires for the final non-empty
    // tensor, so blocks may still be queued here. Launch them.
    if (loc_block_info > 0) {
        multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);

        AT_CUDA_CHECK(cudaGetLastError());
    }
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "compat.h"
#include <assert.h>
// #include <iostream>
// This header is the one-stop shop for all your multi-tensor apply needs.
// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
// Per-launch capacity limits, indexed by (depth - 1), where depth is the number of parallel
// tensor lists. They size the arrays in TensorListMetadata below.
// NOTE(review): the values appear chosen so the struct fits the kernel-argument size limit
// mentioned in the TODO above - confirm before changing them.
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
// Metadata describing one kernel launch; multi_tensor_apply fills it on the host and passes
// it to multi_tensor_apply_kernel by value. `n` is the number of parallel tensor lists.
template <int n>
struct TensorListMetadata {
// addresses[d][i]: data pointer of the i-th tensor registered for this launch, from list d.
void* addresses[n][depth_to_max_tensors[n - 1]];
// sizes[i]: element count of the i-th registered tensor (taken from list 0).
int sizes[depth_to_max_tensors[n - 1]];
// block_to_tensor[b]: index into addresses/sizes of the tensor that block b works on.
unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
// block_to_chunk[b]: which chunk of that tensor block b works on.
int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a full int.
// Index in the caller's tensor list of the first tensor covered by this launch.
int start_tensor_this_launch;
};
// Generic kernel entry point: every thread simply invokes the user-supplied functor with the
// chunk size, the noop flag, the launch metadata `tl` (received by value) and the extra
// arguments. The functor itself uses blockIdx/threadIdx to locate its work inside `tl`.
template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int chunk_size,
volatile int* noop_flag,
T tl,
U callable,
ArgTypes... args)
{
// Hand the chunk information to the user-supplied functor to process however it likes.
callable(chunk_size, noop_flag, tl, args...);
}
// Applies `callable` to every chunk of every tensor in `tensor_lists`, batching as many
// (tensor, chunk) pairs as fit into one TensorListMetadata per kernel launch (HIP build).
//
//   depth         number of parallel tensor lists (must equal tensor_lists.size())
//   block_size    threads per block for each launch
//   chunk_size    elements handled by one thread block; must be positive
//   noop_flag     int tensor whose data pointer is forwarded to the functor
//   tensor_lists  `depth` lists of equal length; tensors at the same position must have the
//                 same number of elements, be contiguous, and live on the same device
//   callable      device functor invoked by multi_tensor_apply_kernel
//   args          extra arguments forwarded to the functor
//
// Zero-element tensors are skipped: they produce no chunks, so registering them would
// consume metadata slots without ever reaching the flush logic (and, if the last tensor were
// empty, would leave previously queued blocks unlaunched).
//
// Raises (via TORCH_CHECK) on any violated precondition and (via AT_CUDA_CHECK) on a launch
// error.
template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int block_size,
                        int chunk_size,
                        const at::Tensor& noop_flag,
                        const std::vector<std::vector<at::Tensor>>& tensor_lists,
                        T callable,
                        ArgTypes... args)
{
    TORCH_CHECK(chunk_size > 0, "chunk_size must be positive");
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++) {
            // TODO: Print which tensor fails.
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory = (contiguous_memory ||
                                 tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
                        "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }

    int ntensors = tensor_lists[0].size();

    TensorListMetadata<depth> tl;

    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();

    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;   // blocks queued for the next launch
    int loc_tensor_info = 0;  // tensors registered for the next launch
    for (int t = 0; t < ntensors; t++) {
        // Empty tensors contribute no chunks; skip them so they neither occupy a metadata
        // slot nor bypass the capacity checks performed inside the chunk loop.
        if (tensor_lists[0][t].numel() == 0) { continue; }

        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;

        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

        for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;

            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                                 chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk) {
                hipLaunchKernelGGL(( multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream,
                    chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);

                AT_CUDA_CHECK(hipGetLastError());

                // Reset. The control flow possibilities here make my brain hurt.
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1) {
                    // The current tensor is finished: start the next launch from scratch.
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                } else {
                    // The current tensor still has chunks left: carry it over as slot 0.
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }

    // When the list ends with empty tensors, `last_chunk` never fires for the final non-empty
    // tensor, so blocks may still be queued here. Launch them.
    if (loc_block_info > 0) {
        hipLaunchKernelGGL(( multi_tensor_apply_kernel), dim3(loc_block_info), dim3(block_size), 0, stream,
            chunk_size, noop_flag.DATA_PTR<int>(), tl, callable, args...);

        AT_CUDA_CHECK(hipGetLastError());
    }
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <libaio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <numeric>
#include <string>
#include <vector>
#include "deepspeed_aio_common.h"
using namespace std;
using namespace std::chrono;
// Set to 1 to print per-operation start/finish messages, latency statistics and throughput.
#define DEBUG_DS_AIO_PERF 0
// Set to 1 to print one line per io_submit call with its latency and first request details.
#define DEBUG_DS_AIO_SUBMIT_PERF 0
// Prefix used on every diagnostic line this file writes to stdout/stderr.
static const std::string c_library_name = "deepspeed_aio";
// Declared separately so the `unused` attribute can be attached: the function is only called
// when DEBUG_DS_AIO_PERF is enabled.
static void _report_aio_statistics(const char* tag,
                                   const std::vector<std::chrono::duration<double>>& latencies)
    __attribute__((unused));

// Prints the minimum, maximum and mean of `latencies` (converted to microseconds) to stdout,
// labelled with `tag`. Prints a "no samples" line when `latencies` is empty.
static void _report_aio_statistics(const char* tag,
                                   const std::vector<std::chrono::duration<double>>& latencies)
{
    // min_element/max_element return end() for an empty range; never dereference that.
    if (latencies.empty()) {
        std::cout << c_library_name << ": latency statistics(usec) " << tag
                  << " min/max/avg = n/a (no samples)" << std::endl;
        return;
    }

    std::vector<double> lat_usec;
    lat_usec.reserve(latencies.size());
    for (const auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); }

    const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end()));
    const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end()));
    // The seed must be a double: an int seed makes std::accumulate sum in int and truncate
    // every sample.
    const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0.0) / lat_usec.size();

    std::cout << c_library_name << ": latency statistics(usec) " << tag
              << " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl;
}
// Summarizes `raw_latencies` into `summary_latencies` as minimum, maximum and mean, all in
// microseconds. When there are no samples (for example a zero-byte transfer that never
// submitted a request) all three fields are set to 0.
static void _get_aio_latencies(std::vector<std::chrono::duration<double>>& raw_latencies,
                               struct deepspeed_aio_latency_t& summary_latencies)
{
    // min_element/max_element return end() for an empty range; never dereference that.
    if (raw_latencies.empty()) {
        summary_latencies._min_usec = 0;
        summary_latencies._max_usec = 0;
        summary_latencies._avg_usec = 0;
        return;
    }

    std::vector<double> lat_usec;
    lat_usec.reserve(raw_latencies.size());
    for (const auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); }

    summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end()));
    summary_latencies._max_usec = *(std::max_element(lat_usec.begin(), lat_usec.end()));
    // The seed must be a double: an int seed makes std::accumulate sum in int and truncate
    // every sample.
    summary_latencies._avg_usec =
        std::accumulate(lat_usec.begin(), lat_usec.end(), 0.0) / lat_usec.size();
}
// Submits the first `n_iocbs` prepared requests of `aio_ctxt` one io_submit call at a time,
// appending the duration of each call to `submit_times`. `iocb_index` is only used in the
// optional debug output. Each submission is asserted to have succeeded.
static void _do_io_submit_singles(const long long int n_iocbs,
                                  const long long int iocb_index,
                                  std::unique_ptr<aio_context>& aio_ctxt,
                                  std::vector<std::chrono::duration<double>>& submit_times)
{
    for (auto i = 0; i < n_iocbs; ++i) {
        const auto request = aio_ctxt->_iocbs.data() + i;

        const auto begin = std::chrono::high_resolution_clock::now();
        const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, request);
        const auto finish = std::chrono::high_resolution_clock::now();
        submit_times.push_back(finish - begin);

#if DEBUG_DS_AIO_SUBMIT_PERF
        printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n",
               submit_times.back().count() * 1e6,
               iocb_index,
               aio_ctxt->_iocbs[i]->u.c.buf,
               aio_ctxt->_iocbs[i]->u.c.nbytes,
               aio_ctxt->_iocbs[i]->u.c.offset);
#endif

        assert(submit_ret > 0);
    }
}
// Submits the first `n_iocbs` prepared requests of `aio_ctxt` with a single io_submit call
// and appends the duration of that call to `submit_times`. `iocb_index` is only used in the
// optional debug output. The submission is asserted to have succeeded.
static void _do_io_submit_block(const long long int n_iocbs,
                                const long long int iocb_index,
                                std::unique_ptr<aio_context>& aio_ctxt,
                                std::vector<std::chrono::duration<double>>& submit_times)
{
    const auto requests = aio_ctxt->_iocbs.data();

    const auto begin = std::chrono::high_resolution_clock::now();
    const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, requests);
    const auto finish = std::chrono::high_resolution_clock::now();
    submit_times.push_back(finish - begin);

#if DEBUG_DS_AIO_SUBMIT_PERF
    printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n",
           submit_times.back().count() * 1e6,
           iocb_index,
           n_iocbs,
           aio_ctxt->_iocbs[0]->u.c.buf,
           aio_ctxt->_iocbs[0]->u.c.nbytes,
           aio_ctxt->_iocbs[0]->u.c.offset);
#endif

    assert(submit_ret > 0);
}
// Waits (without timeout) for between `min_completes` and `max_completes` completion events
// on `aio_ctxt`, appends the wait duration to `reap_times`, and returns the number of events
// reaped. The count is asserted to be at least `min_completes`.
static int _do_io_complete(const long long int min_completes,
                           const long long int max_completes,
                           std::unique_ptr<aio_context>& aio_ctxt,
                           std::vector<std::chrono::duration<double>>& reap_times)
{
    const auto events = aio_ctxt->_io_events.data();

    const auto begin = std::chrono::high_resolution_clock::now();
    const auto n_completes =
        io_getevents(aio_ctxt->_io_ctxt, min_completes, max_completes, events, nullptr);
    const auto finish = std::chrono::high_resolution_clock::now();
    reap_times.push_back(finish - begin);

    assert(n_completes >= min_completes);
    return n_completes;
}
// Performs a read or write of the whole transfer in batches: each batch prepares and submits
// up to `_queue_depth` blocks and then waits for all of them to complete before the next
// batch starts.
//
//   read_op    true for a read, false for a write
//   aio_ctxt   libaio context (queue depth, block size, request and event storage)
//   xfer_ctxt  describes the transfer (memory buffer and byte count)
//   config     `_single_submit` selects one io_submit call per request instead of per batch
//   perf       optional; when non-null receives submit/complete latency summaries, the
//              end-to-end time in microseconds and the rate in GB/s
void do_aio_operation_sequential(const bool read_op,
                                 std::unique_ptr<aio_context>& aio_ctxt,
                                 std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                                 deepspeed_aio_config_t* config,
                                 deepspeed_aio_perf_t* perf)
{
    struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs);

    // Number of blocks needed to cover the transfer, rounding up.
    const auto num_io_blocks = static_cast<long long int>(
        ceil(static_cast<double>(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size));

#if DEBUG_DS_AIO_PERF
    const auto io_op_name = std::string(read_op ? "read" : "write");
    std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
              << " bytes with " << num_io_blocks << " io blocks" << std::endl;
#endif

    std::vector<std::chrono::duration<double>> submit_times;
    std::vector<std::chrono::duration<double>> reap_times;

    // Largest number of bytes a single batch can carry.
    const auto max_queue_bytes =
        static_cast<long long int>(aio_ctxt->_queue_depth * aio_ctxt->_block_size);

    const auto start = std::chrono::high_resolution_clock::now();

    long long iocb_index = 0;
    while (iocb_index < num_io_blocks) {
        const auto start_offset = iocb_index * aio_ctxt->_block_size;
        const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset;

        // The final batch may be shorter than the queue depth.
        const auto n_iocbs =
            min(static_cast<long long>(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index));
        const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset));

        prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset);

        const auto submit = config->_single_submit ? &_do_io_submit_singles : &_do_io_submit_block;
        submit(n_iocbs, iocb_index, aio_ctxt, submit_times);

        // Drain the whole batch before preparing the next one.
        _do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times);

        iocb_index += aio_ctxt->_queue_depth;
    }

    const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;

    if (perf) {
        _get_aio_latencies(submit_times, perf->_submit);
        _get_aio_latencies(reap_times, perf->_complete);
        perf->_e2e_usec = elapsed.count() * 1e6;
        perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
    }

#if DEBUG_DS_AIO_PERF
    _report_aio_statistics("submit", submit_times);
    _report_aio_statistics("complete", reap_times);

    std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
              << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;

    std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
              << " bytes " << std::endl;
#endif
}
// Performs a read or write of the whole transfer while keeping the submission queue topped
// up: each iteration prepares and submits as many new requests as there are free queue
// slots, then waits for at least one outstanding request to complete. The loop ends when
// nothing is pending after a submission round.
//
//   read_op    true for a read, false for a write
//   aio_ctxt   libaio context (queue depth, block size, request and event storage)
//   xfer_ctxt  describes the transfer (memory buffer and byte count)
//   config     `_single_submit` selects one io_submit call per request instead of per batch
//   perf       optional; when non-null receives submit/complete latency summaries, the
//              end-to-end time in microseconds and the rate in GB/s
void do_aio_operation_overlap(const bool read_op,
                              std::unique_ptr<aio_context>& aio_ctxt,
                              std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
                              deepspeed_aio_config_t* config,
                              deepspeed_aio_perf_t* perf)
{
    struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size);

#if DEBUG_DS_AIO_PERF
    const auto io_op_name = std::string(read_op ? "read" : "write");
    std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes
              << " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl;
#endif

    std::vector<std::chrono::duration<double>> submit_times;
    std::vector<std::chrono::duration<double>> reap_times;

    auto request_iocbs = aio_ctxt->_queue_depth;
    auto n_pending_iocbs = 0;
    const auto min_completes = 1;

    const auto start = std::chrono::high_resolution_clock::now();

    for (;;) {
        // Fill every free queue slot with a freshly prepared request.
        const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs);
        if (n_iocbs > 0) {
            const auto first_index = io_gen._next_iocb_index - n_iocbs;
            const auto submit =
                config->_single_submit ? &_do_io_submit_singles : &_do_io_submit_block;
            submit(n_iocbs, first_index, aio_ctxt, submit_times);
        }

        n_pending_iocbs += n_iocbs;
        assert(n_pending_iocbs <= aio_ctxt->_queue_depth);

        // Nothing new was submitted and nothing is outstanding: the transfer is done.
        if (n_pending_iocbs == 0) { break; }

        const auto n_complete =
            _do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times);
        n_pending_iocbs -= n_complete;
    }

    const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;

    if (perf) {
        _get_aio_latencies(submit_times, perf->_submit);
        _get_aio_latencies(reap_times, perf->_complete);
        perf->_e2e_usec = elapsed.count() * 1e6;
        perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9);
    }

#if DEBUG_DS_AIO_PERF
    _report_aio_statistics("submit", submit_times);
    _report_aio_statistics("complete", reap_times);

    std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6
              << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl;

    std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes
              << " bytes " << std::endl;
#endif
}
// Writes "<library>: <file_op> failed on <filename> error = <error_code>" to stderr.
void report_file_error(const char* filename, const std::string file_op, const int error_code)
{
    std::string err_msg(file_op);
    err_msg += std::string(" failed on ");
    err_msg += std::string(filename);
    err_msg += " error = ";
    err_msg += std::to_string(error_code);

    std::cerr << c_library_name << ": " << err_msg << std::endl;
}
// Opens `filename` with direct I/O, read-only when `read_op` is true and write-only
// (creating the file with mode 0600 if needed) otherwise. Returns the file descriptor, or
// -1 after reporting the errno value through report_file_error.
int open_file(const char* filename, const bool read_op)
{
    int flags = __O_DIRECT;
    if (read_op) {
        flags |= O_RDONLY;
    } else {
        flags |= (O_WRONLY | O_CREAT);
    }
    const int mode = 0600;

    const auto fd = open(filename, flags, mode);
    if (fd != -1) { return fd; }

    // Capture errno immediately, before any other call can overwrite it.
    const auto error_code = errno;
    const auto error_msg = read_op ? " open for read " : " open for write ";
    report_file_error(filename, error_msg, error_code);
    return -1;
}
int regular_read(const char* filename, std::vector<char>& buffer)
{
long long int num_bytes;
const auto f_size = get_file_size(filename, num_bytes);
assert(f_size != -1);
buffer.resize(num_bytes);
const auto fd = open(filename, O_RDONLY, 0600);
assert(fd != -1);
long long int read_bytes = 0;
auto r = 0;
do {
const auto buffer_ptr = buffer.data() + read_bytes;
const auto bytes_to_read = num_bytes - read_bytes;
r = read(fd, buffer_ptr, bytes_to_read);
read_bytes += r;
} while (r > 0);
if (read_bytes != num_bytes) {
std::cerr << "read error "
<< " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
<< std::endl;
}
assert(read_bytes == num_bytes);
close(fd);
return 0;
}
// Re-reads `filename` with regular_read and compares the result with `aio_buffer`.
// Returns true only when the file holds exactly `num_bytes` bytes and they match the first
// `num_bytes` bytes of `aio_buffer`. Prints the size of the regular read to stdout.
static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes)
{
    std::vector<char> reference;
    const auto reg_ret = regular_read(filename, reference);
    assert(0 == reg_ret);

    const auto reference_size = reference.size();
    std::cout << "regular read of " << filename << " returned " << reference_size << " bytes"
              << std::endl;

    // Size check first; the byte comparison only runs when the sizes agree.
    const bool same_size = (static_cast<long long int>(reference_size) == num_bytes);
    return same_size && (0 == memcmp(aio_buffer, reference.data(), reference_size));
}
// Checks that `aio_buffer` (holding `num_bytes` bytes) matches the contents of `filename` as
// read back with ordinary read() calls, prints a Pass/Fail line to stdout, and returns the
// outcome. `read_op` only selects the operation name used in the message.
// NOTE(review): the message has no space before "using" (e.g. "deepspeed_aio_read()using
// read()"); left unchanged here because it is runtime output.
bool validate_aio_operation(const bool read_op,
                            const char* filename,
                            void* aio_buffer,
                            const long long int num_bytes)
{
    std::string msg_suffix("deepspeed_aio_");
    msg_suffix += (read_op ? "read()" : "write()");
    msg_suffix += "using read()";

    const bool matches = _validate_buffer(filename, aio_buffer, num_bytes);
    std::cout << (matches ? "Pass: correctness of " : "Fail: correctness of ") << msg_suffix
              << std::endl;
    return matches;
}
/*
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include <deepspeed_aio_utils.h>
#include <stdlib.h>
#include <memory>
#include <string>
using namespace std;
// Transfers the whole buffer in batches of up to queue-depth blocks, waiting for every block
// of a batch to complete before submitting the next batch. `perf` may be null.
void do_aio_operation_sequential(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf);
// Transfers the whole buffer while refilling the submission queue as soon as at least one
// outstanding request has completed. `perf` may be null.
void do_aio_operation_overlap(const bool read_op,
std::unique_ptr<aio_context>& aio_ctxt,
std::unique_ptr<io_xfer_ctxt>& xfer_ctxt,
deepspeed_aio_config_t* config,
deepspeed_aio_perf_t* perf);
// Opens `filename` with direct I/O for reading (read_op == true) or writing; returns the
// file descriptor, or -1 after reporting the error.
int open_file(const char* filename, const bool read_op);
// Writes a one-line description of a failed file operation and its error code to stderr.
void report_file_error(const char* filename, const std::string file_op, const int error_code);
// Reads the whole file into `buffer` with ordinary read() calls; returns 0.
int regular_read(const char* filename, std::vector<char>& buffer);
// Compares `aio_buffer` against the file contents read back with regular_read, prints a
// Pass/Fail line, and returns whether they match.
bool validate_aio_operation(const bool read_op,
const char* filename,
void* aio_buffer,
const long long int num_bytes);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment