Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
'''Copyright The Microsoft DeepSpeed Team'''
import torch
import time
import deepspeed
import argparse
from transformers import pipeline
from deepspeed.accelerator import get_accelerator
# Command-line arguments for the BERT fill-mask inference benchmark.
parser = argparse.ArgumentParser()
parser.add_argument("--model", "-m", type=str, help="hf model name")
parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32")
parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
parser.add_argument("--local_rank", type=int, default=0, help="local rank")
parser.add_argument("--trials", type=int, default=30, help="number of trials")
parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
args = parser.parse_args()
def print_latency(latency_set, title, warmup=3):
    """Print average and P50/P90/P95/P99/P99.9 latency statistics in ms.

    Args:
        latency_set: list of per-trial latencies, in seconds.
        title: label shown in the stats header.
        warmup: number of leading measurements to discard before computing stats.
    """
    # Drop warmup samples (the slice copies, so the caller's list is untouched).
    samples = sorted(latency_set[warmup:])
    count = len(samples)
    if count > 0:
        avg = sum(samples) / count

        # Nearest-rank percentile lookup over the sorted samples.
        def pct(q):
            return samples[int((count - 1) * q + 1) - 1]

        print(f"====== latency stats {title} ======")
        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
        print("\tP50 Latency: {0:8.2f} ms".format(pct(0.5) * 1000))
        print("\tP90 Latency: {0:8.2f} ms".format(pct(0.9) * 1000))
        print("\tP95 Latency: {0:8.2f} ms".format(pct(0.95) * 1000))
        print("\tP99 Latency: {0:8.2f} ms".format(pct(0.99) * 1000))
        print("\t999 Latency: {0:8.2f} ms".format(pct(0.999) * 1000))
# Initialize the distributed backend (env set by the deepspeed launcher).
deepspeed.init_distributed()
print(args.model, args.max_tokens, args.dtype)
# Map the dtype flag to a torch dtype; anything other than fp16 falls back to fp32.
if args.dtype.lower() == "fp16":
    dtype = torch.float16
else:
    dtype = torch.float32
# Build a HuggingFace fill-mask pipeline on this rank's device.
pipe = pipeline("fill-mask", model=args.model, framework="pt", device=args.local_rank)
if dtype == torch.half:
    pipe.model.half()
mask = pipe.tokenizer.mask_token
# One untimed warmup call (triggers weight download / CUDA context creation).
br = pipe(f"Hello I'm a {mask} model")
# Optionally wrap the model with DeepSpeed inference (single GPU: mp_size=1).
if args.deepspeed:
    pipe.model = deepspeed.init_inference(pipe.model,
                                          dtype=dtype,
                                          mp_size=1,
                                          replace_with_kernel_inject=args.kernel_inject,
                                          enable_cuda_graph=args.graphs)
    pipe.model.profile_model_time()
responses = []
times = []
mtimes = []
for i in range(args.trials):
    # Synchronize the accelerator around the call so wall time covers the kernels.
    get_accelerator().synchronize()
    start = time.time()
    r = pipe(f"Hello I'm a {mask} model")
    get_accelerator().synchronize()
    end = time.time()
    responses.append(r)
    # End-to-end wall time for this trial.
    times.append((end - start))
    # NOTE(review): model_times() is provided by the DeepSpeed-wrapped module;
    # this looks like it would fail without --deepspeed — confirm.
    mtimes += pipe.model.model_times()
    #print(f"{pipe.model.model_times()=}")
print_latency(times, "e2e latency")
print_latency(mtimes, "model latency")
print(responses[0:3])
'''Copyright The Microsoft DeepSpeed Team'''
import os
import re
import argparse
import pandas as pd
# Command-line arguments for collecting sweep results into one CSV.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--results-dir",
    "-r",
    type=str,
    default="./results",
    help="directory containing sweep results",
)
parser.add_argument("--version",
                    "-v",
                    type=int,
                    default=0,
                    help="version to be collected")
parser.add_argument("--gen-text-n",
                    "-n",
                    type=int,
                    default=1,
                    help="expected number of generated text")
parser.add_argument("--output",
                    "-o",
                    type=str,
                    default="./results.csv",
                    help="output file")
args = parser.parse_args()
def get_branch(file_path):
    """Return the log file's basename (the branch name), or False for non-.log paths."""
    found = re.match(r".*\/(.*)\.log", file_path)
    return found.group(1) if found else False
def get_benchmark_params(root_dir, file_path):
    """Parse benchmark settings encoded in a result directory name.

    Expects paths shaped like
    ``<root_dir>/<model>_<fpN>_<graphs>_<kernel>_<gpus>gpus_v<version>/...``.
    Returns a dict of the parsed settings, or False when the path does not match.
    """
    pattern = rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/"
    found = re.match(pattern, file_path)
    if found is None:
        return False
    model, dtype, graphs, kernel, gpus, version = found.groups()
    return {
        "model": model,
        "dtype": dtype,
        # The capture groups only ever match "true"/"false".
        "graphs": graphs.lower() == "true",
        "kernel": kernel.lower() == "true",
        "gpus": int(gpus),
        "version": int(version),
    }
def get_perf_data(file_content):
    """Extract latency statistics printed by the benchmark scripts.

    Returns {"latency-<name>": <ms value>} for every "<name> Latency: X ms"
    line found in the log, or False when no latency lines are present.
    """
    matches = re.findall(r"\s+(.+?)\sLatency:\s+(\d+\.\d+)\sms", file_content)
    # Bug fix: the original tested `matches is []`, an identity check that is
    # never True, so an empty result fell through and returned {} instead of False.
    if not matches:
        return False
    return {f"latency-{key}": float(val) for key, val in matches}
def get_generated_text(file_content, gen_text_n):
    """Collect the generated-text responses recorded in a benchmark log.

    Returns {"generated-text-<i>": <text>} when exactly ``gen_text_n``
    responses are found, otherwise False.
    """
    # Flatten whitespace so each response body matches as a single line.
    flattened = file_content.replace("\n", " ").replace("\t", " ")
    found = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", flattened)
    if len(found) != gen_text_n:
        return False
    return {f"generated-text-{idx}": text for idx, text in found}
def get_error(file_content):
    """Extract the error message ("Error: ...") from a benchmark log.

    Returns {"error": <message>} using the last error line when several are
    present (matching the original dict-comprehension behavior), or False
    when the log holds no error lines.
    """
    matches = re.findall(r"Error:\s+(.+?)\n", file_content)
    # Bug fix: the original tested `matches is []`, an identity check that is
    # never True, so the empty case returned an empty dict instead of False.
    if not matches:
        return False
    # The original built {f"error": val for val in matches} — a constant key,
    # so only the last match survived; say that directly.
    return {"error": matches[-1]}
if __name__ == "__main__":
    # One row per (branch, benchmark-config) log file found in the sweep output.
    benchmarks_data = []
    # Walk through directory of results from sweep.sh.
    # Because of how some models are named, the dir structure for results can vary, e.g.:
    # "EleutherAI/gpt-neo_*/baseline.log" versus "gpt2_*/baseline.log"
    for root, dirs, files in os.walk(args.results_dir):
        # Only leaf directories contain log files.
        if dirs:
            continue
        # Get data from baseline and each tested branch.
        for name in files:
            file_path = os.path.join(root, name)
            branch = get_branch(file_path)
            if not branch:
                print(f"WARNING: Could not detect branch for file {file_path}, skipping")
                continue
            params = get_benchmark_params(args.results_dir, file_path)
            if not params:
                print(
                    f"WARNING: Could not detect benchmark settings for file {file_path}, skipping"
                )
                continue
            # Verify that the version matches that which we want to collect.
            if params["version"] != args.version:
                continue
            with open(file_path, "r") as f:
                file_content = f.read()
            perf_data = get_perf_data(file_content)
            if not perf_data:
                print(
                    f"WARNING: Could not detect benchmark performance data for file {file_path}"
                )
                # Bug fix: the helpers return False when nothing is found, and
                # unpacking False with ** below raises TypeError. Substitute an
                # empty mapping so the row can still be built.
                perf_data = {}
            generated_text = get_generated_text(file_content, args.gen_text_n)
            if not generated_text:
                print(f"WARNING: Could not detect generated text for file {file_path}")
                generated_text = {}
            error = get_error(file_content)
            if error:
                # A failed run: record the error info instead of perf data.
                print(f"Error found in {file_path}, collecting error info...")
                benchmarks_data.append({"branch": branch, **params, **error})
                continue
            benchmarks_data.append({
                "branch": branch,
                **params,
                **perf_data,
                **generated_text
            })
    # Convert to a DataFrame and save.
    benchmarks_df = pd.DataFrame(benchmarks_data)
    benchmarks_df.to_csv(args.output)
'''Copyright The Microsoft DeepSpeed Team'''
import os
import torch
import time
import deepspeed
import argparse
from transformers import pipeline
from deepspeed.accelerator import get_accelerator
# Command-line arguments for the GPT text-generation inference benchmark.
parser = argparse.ArgumentParser()
parser.add_argument("--model", "-m", type=str, help="hf model name")
parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
parser.add_argument("--dtype",
                    type=str,
                    default="fp16",
                    choices=["fp16",
                             "fp32",
                             "int8"],
                    help="int8, fp16, or fp32")
parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
# Rank / world size default to the env vars exported by the deepspeed launcher.
parser.add_argument("--local_rank",
                    type=int,
                    default=int(os.getenv("LOCAL_RANK",
                                          "0")),
                    help="local rank")
parser.add_argument("--world_size",
                    type=int,
                    default=int(os.getenv("WORLD_SIZE",
                                          "1")),
                    help="world size")
parser.add_argument("--trials", type=int, default=30, help="number of trials")
args = parser.parse_args()
def print_latency(latency_set, title, warmup=3):
    """Print average and P50/P90/P95/P99/P99.9 latency statistics in ms.

    Args:
        latency_set: iterable of per-trial latencies in seconds (may be lazy,
            e.g. a map object).
        title: label shown in the stats header.
        warmup: number of leading measurements to discard before computing stats.
    """
    # Materialize the iterable, then drop the warmup measurements.
    samples = sorted(list(latency_set)[warmup:])
    count = len(samples)
    if count > 0:
        avg = sum(samples) / count

        # Nearest-rank percentile lookup over the sorted samples.
        def pct(q):
            return samples[int((count - 1) * q + 1) - 1]

        print(f"====== latency stats {title} ======")
        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
        print("\tP50 Latency: {0:8.2f} ms".format(pct(0.5) * 1000))
        print("\tP90 Latency: {0:8.2f} ms".format(pct(0.9) * 1000))
        print("\tP95 Latency: {0:8.2f} ms".format(pct(0.95) * 1000))
        print("\tP99 Latency: {0:8.2f} ms".format(pct(0.99) * 1000))
        print("\t999 Latency: {0:8.2f} ms".format(pct(0.999) * 1000))
# Initialize the distributed backend (env provided by the deepspeed launcher).
deepspeed.init_distributed()
# Only rank 0 reports settings/results, to avoid duplicated output.
if args.local_rank == 0:
    print("BENCHMARK SETTINGS:")
    print(f"\tMODEL: {args.model}")
    print(f"\tMAX_TOKENS: {args.max_tokens}")
    print(f"\tDTYPE: {args.dtype}")
    print(f"\tCUDA_GRAPHS: {args.graphs}")
    print(f"\tKERNEL_INJECT: {args.kernel_inject}")
# Map the dtype flag to a torch dtype.
if args.dtype == "int8":
    dtype = torch.int8
elif args.dtype == "fp16":
    dtype = torch.float16
else:
    dtype = torch.float32
# Build a HuggingFace text-generation pipeline on this rank's device.
pipe = pipeline("text-generation",
                model=args.model,
                framework="pt",
                device=args.local_rank)
if dtype == torch.float16:
    pipe.model.half()
# Optionally wrap the model with DeepSpeed inference (tensor-parallel over world_size).
if args.deepspeed:
    pipe.model = deepspeed.init_inference(
        pipe.model,
        dtype=dtype,
        mp_size=args.world_size,
        replace_with_kernel_inject=args.kernel_inject,
        enable_cuda_graph=args.graphs,
    )
    pipe.model.profile_model_time()
responses = []
times = []
mtimes = []
for i in range(args.trials):
    # Synchronize the accelerator around the call so wall time covers the kernels.
    get_accelerator().synchronize()
    start = time.time()
    # Greedy decoding so every trial performs identical work.
    r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
    get_accelerator().synchronize()
    end = time.time()
    responses.append(r)
    times.append(end - start)  # / (args.max_tokens - 3))
    # NOTE(review): model_times() is provided by the DeepSpeed-wrapped module;
    # this looks like it would fail without --deepspeed — confirm.
    mtimes.append(sum(pipe.model.model_times()))
if args.local_rank == 0:
    print_latency(times, "(e2e) latency")
    print_latency(mtimes, "(model-only) latency")
    # Per-token latency; the "- 3" offset is inherited from the original code
    # (presumably prompt-token overhead — verify).
    print_latency(map(lambda t: t / (args.max_tokens - 3),
                      times),
                  "(e2e) per token latency")
    print(f"RESPONSE 0:")
    print("-" * 30)
    print(responses[0][0]["generated_text"])
    print("-" * 30)
# Benchmark one model three ways: HF baseline, then DeepSpeed inference on two
# git branches. Logs land in results/<model>_<dtype>_<graphs>_<kernel>_<gpus>gpus_v<version>.
# Usage: run_model.sh <model> <branch1> <branch2> <dtype> <graphs> <kernel> <gpus>
set -x
model=$1
branch1=$2
branch2=$3
dtype=$4
graphs=$5
kernel=$6
gpus=$7
version=0
log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version}
mkdir -p ${log_path}
# Build the flag string forwarded to gpt-bench.py.
params="--dtype $dtype "
if [[ "$graphs" == "true" ]]; then
    params+="--graphs "
fi
if [[ "$kernel" == "true" ]]; then
    # NOTE(review): gpt-bench.py declares --kernel-inject; "--kernel" only works
    # through argparse prefix matching — confirm, or spell the flag out.
    params+="--kernel "
fi
# Baseline: plain HuggingFace pipeline on a single GPU.
echo "baseline $log_path"
deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log
# DeepSpeed inference on branch1 (the repo root is assumed two levels up).
cd ../../
git checkout ${branch1}
cd -
echo "ds ${branch1} $log_path"
deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch1}.log
# DeepSpeed inference on branch2.
cd ../../
git checkout ${branch2}
cd -
echo "ds ${branch2} $log_path"
deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params&> ${log_path}/ds-${branch2}.log
# Sweep every model family across gpu-count / dtype / CUDA-graphs / kernel-injection
# combinations, delegating each configuration to run_model.sh.
# Usage: sweep.sh <branch1> <branch2>
set -x
export TRANSFORMERS_CACHE=/tmp/hf-cache
branch1=$1
branch2=$2
# Model families to benchmark (space-separated HF model names).
gptneo_models="EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M"
gpt2_models="gpt2 gpt2-large gpt2-xl"
gptj_models="EleutherAI/gpt-j-6B"
opt_models="facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b"
bloom_models="bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1"
for gpus in `echo "1 2 4 8"`; do
    for dtype in `echo "fp16 fp32"`; do
        for graphs in `echo "true false"`; do
            for kernel in `echo "true false"`; do
                # Positional args 4-7 expected by run_model.sh.
                params="$dtype $graphs $kernel $gpus"
                for m in `echo "$gptneo_models"`; do
                    bash run_model.sh $m $branch1 $branch2 $params
                done
                for m in `echo "$gpt2_models"`; do
                    bash run_model.sh $m $branch1 $branch2 $params
                done
                for m in `echo "$gptj_models"`; do
                    bash run_model.sh $m $branch1 $branch2 $params
                done
                for m in `echo "$opt_models"`; do
                    bash run_model.sh $m $branch1 $branch2 $params
                done
                for m in `echo "$bloom_models"`; do
                    bash run_model.sh $m $branch1 $branch2 $params
                done
            done
        done
    done
done
#!/usr/bin/env python3
# Thin console-script wrapper around the DeepSpeed launcher entry point.
from deepspeed.launcher.runner import main
if __name__ == '__main__':
    main()
ds
\ No newline at end of file
#!/usr/bin/env python3
# Thin console-script wrapper around the DeepSpeed launcher entry point.
from deepspeed.launcher.runner import main
if __name__ == '__main__':
    main()
ds
\ No newline at end of file
File mode changed from 100644 to 100755
#!/usr/bin/env python3
# Entry point for the DeepSpeed communication benchmarks.
from benchmarks.communication.run_all import main
from benchmarks.communication.constants import *
from benchmarks.communication.utils import *
import os
import sys
# Run the same file with deepspeed launcher. This is required since setuptools will auto-detect python files and insert a python shebang for both 'scripts' and 'entry_points', and this benchmarks require the DS launcher
required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
if not all(map(lambda v: v in os.environ, required_env)):
    # Not running under the DS launcher yet: re-exec this script through `deepspeed`.
    import subprocess
    subprocess.run("deepspeed $(which ds_bench) " + " ".join(sys.argv[1:]), shell=True)
else:
    # Launcher env present: parse args (benchmark_parser comes from the star
    # import of benchmarks.communication.utils — verify) and run on this rank.
    args = benchmark_parser().parse_args()
    rank = args.local_rank
    main(args, rank)
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
ds_report
\ No newline at end of file
@echo off
rem Build a DeepSpeed wheel on Windows. AIO and sparse-attention ops are
rem disabled via env vars before the build.
set DS_BUILD_AIO=0
set DS_BUILD_SPARSE_ATTN=0
echo Administrative permissions required. Detecting permissions...
rem `net session` succeeds only in an elevated shell, so its errorLevel
rem reveals whether we have admin rights.
net session >nul 2>&1
if %errorLevel% == 0 (
    echo Success: Administrative permissions confirmed.
) else (
    echo Failure: Current permissions inadequate.
    goto end
)
python setup.py bdist_wheel
:end
#ifdef __HIPCC__
#include "cpu_adagrad_hip.h"
#else
#include "cpu_adagrad.h" #include "cpu_adagrad.h"
#include <cuda_runtime_api.h> #endif
#include <math.h>
#include <omp.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <type_traits> #include <type_traits>
#include <unordered_map> #include <unordered_map>
#if defined(__ENABLE_CUDA__)
#include <cuda_runtime_api.h>
#include "cublas_v2.h" #include "cublas_v2.h"
#include "cuda.h" #include "cuda.h"
#include "curand.h" #include "curand.h"
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#endif
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers; static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
...@@ -20,7 +25,7 @@ void Adagrad_Optimizer::Step_1(float* _params, ...@@ -20,7 +25,7 @@ void Adagrad_Optimizer::Step_1(float* _params,
float* grads, float* grads,
float* _exp_avg_sq, float* _exp_avg_sq,
size_t _param_size, size_t _param_size,
__half* dev_params, ds_half_precision_t* dev_params,
bool half_precision) bool half_precision)
{ {
size_t rounded_size = 0; size_t rounded_size = 0;
...@@ -30,17 +35,19 @@ void Adagrad_Optimizer::Step_1(float* _params, ...@@ -30,17 +35,19 @@ void Adagrad_Optimizer::Step_1(float* _params,
#endif #endif
if (_param_size > rounded_size) { if (_param_size > rounded_size) {
float step_size = -1 * _alpha; float step_size = -1 * _alpha;
__half* grads_cast_h; ds_half_precision_t* grads_cast_h;
__half* params_cast_h; ds_half_precision_t* params_cast_h;
if (half_precision) { if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads); grads_cast_h = reinterpret_cast<ds_half_precision_t*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params); params_cast_h = reinterpret_cast<ds_half_precision_t*>(_params);
} }
for (size_t t = rounded_size; t < _param_size; t += TILE) { for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE; size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t; if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t; size_t offset = copy_size + t;
#if defined(__ENABLE_CUDA__)
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#endif
#pragma omp parallel for #pragma omp parallel for
for (size_t k = t; k < offset; k++) { for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
...@@ -55,21 +62,24 @@ void Adagrad_Optimizer::Step_1(float* _params, ...@@ -55,21 +62,24 @@ void Adagrad_Optimizer::Step_1(float* _params,
grad += _eps; grad += _eps;
grad = momentum / grad; grad = momentum / grad;
param = grad * step_size + param; param = grad * step_size + param;
#if defined(__ENABLE_CUDA__)
if (dev_params) _doubled_buffer[_buf_index][k - t] = param; if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
#endif
if (half_precision) if (half_precision)
params_cast_h[k] = (__half)param; params_cast_h[k] = (ds_half_precision_t)param;
else else
_params[k] = param; _params[k] = param;
// STORE UPDATE TERM TO GRAD'S MEMORY // STORE UPDATE TERM TO GRAD'S MEMORY
grads[k] = grad * step_size; grads[k] = grad * step_size;
_exp_avg_sq[k] = variance; _exp_avg_sq[k] = variance;
} }
#if defined(__ENABLE_CUDA__)
if (dev_params) { if (dev_params) {
launch_param_update( launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index; _buf_index = !_buf_index;
} }
#endif
} }
} }
} }
...@@ -78,7 +88,7 @@ void Adagrad_Optimizer::Step_4(float* _params, ...@@ -78,7 +88,7 @@ void Adagrad_Optimizer::Step_4(float* _params,
float* grads, float* grads,
float* _exp_avg_sq, float* _exp_avg_sq,
size_t _param_size, size_t _param_size,
__half* dev_params, ds_half_precision_t* dev_params,
bool half_precision) bool half_precision)
{ {
size_t rounded_size = 0; size_t rounded_size = 0;
...@@ -130,7 +140,7 @@ void Adagrad_Optimizer::Step_8(float* _params, ...@@ -130,7 +140,7 @@ void Adagrad_Optimizer::Step_8(float* _params,
float* grads, float* grads,
float* _exp_avg_sq, float* _exp_avg_sq,
size_t _param_size, size_t _param_size,
__half* dev_params, ds_half_precision_t* dev_params,
bool half_precision) bool half_precision)
{ {
size_t rounded_size = 0; size_t rounded_size = 0;
...@@ -170,7 +180,9 @@ int ds_adagrad_step(int optimizer_id, ...@@ -170,7 +180,9 @@ int ds_adagrad_step(int optimizer_id,
opt->update_state(lr, epsilon, weight_decay); opt->update_state(lr, epsilon, weight_decay);
opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0)); opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));
#if defined(__ENABLE_CUDA__)
opt->SynchronizeStreams(); opt->SynchronizeStreams();
#endif
return 0; return 0;
} }
...@@ -184,6 +196,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id, ...@@ -184,6 +196,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id,
torch::Tensor& exp_avg_sq, torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params) torch::Tensor& gpu_params)
{ {
#if defined(__ENABLE_CUDA__)
auto params_c = params.contiguous(); auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous(); auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_sq_c = exp_avg_sq.contiguous(); auto exp_avg_sq_c = exp_avg_sq.contiguous();
...@@ -191,7 +204,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id, ...@@ -191,7 +204,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id,
float* params_ptr = (float*)params_c.data_ptr(); float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr(); float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
std::shared_ptr<Adagrad_Optimizer> opt = std::shared_ptr<Adagrad_Optimizer> opt =
...@@ -206,6 +219,9 @@ int ds_adagrad_step_plus_copy(int optimizer_id, ...@@ -206,6 +219,9 @@ int ds_adagrad_step_plus_copy(int optimizer_id,
(params.options().dtype() == at::kHalf)); (params.options().dtype() == at::kHalf));
opt->SynchronizeStreams(); opt->SynchronizeStreams();
#else
assert(false);
#endif
return 0; return 0;
} }
......
#include "cpu_adam.h" #include "cpu_adam.h"
#include <cuda_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <cassert>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <type_traits> #include <type_traits>
#include <unordered_map> #include <unordered_map>
#if defined(__ENABLE_CUDA__)
#include <cuda_runtime_api.h>
#include "cublas_v2.h" #include "cublas_v2.h"
#include "cuda.h" #include "cuda.h"
#include "curand.h" #include "curand.h"
#include "custom_cuda_layers.h" #include "custom_cuda_layers.h"
#endif
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers; static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;
...@@ -21,7 +23,7 @@ void Adam_Optimizer::Step_1(float* _params, ...@@ -21,7 +23,7 @@ void Adam_Optimizer::Step_1(float* _params,
float* _exp_avg, float* _exp_avg,
float* _exp_avg_sq, float* _exp_avg_sq,
size_t _param_size, size_t _param_size,
__half* dev_params, ds_half_precision_t* dev_params,
bool half_precision) bool half_precision)
{ {
size_t rounded_size = 0; size_t rounded_size = 0;
...@@ -41,19 +43,20 @@ void Adam_Optimizer::Step_1(float* _params, ...@@ -41,19 +43,20 @@ void Adam_Optimizer::Step_1(float* _params,
float step_size = -1 * _alpha / _bias_correction1; float step_size = -1 * _alpha / _bias_correction1;
float w_decay = -1 * _alpha * _weight_decay; float w_decay = -1 * _alpha * _weight_decay;
__half* grads_cast_h; ds_half_precision_t* grads_cast_h;
__half* params_cast_h; ds_half_precision_t* params_cast_h;
if (half_precision) { if (half_precision) {
grads_cast_h = reinterpret_cast<__half*>(grads); grads_cast_h = reinterpret_cast<ds_half_precision_t*>(grads);
params_cast_h = reinterpret_cast<__half*>(_params); params_cast_h = reinterpret_cast<ds_half_precision_t*>(_params);
} }
for (size_t t = rounded_size; t < _param_size; t += TILE) { for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE; size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t; if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t; size_t offset = copy_size + t;
#if defined(__ENABLE_CUDA__)
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#endif
#pragma omp parallel for #pragma omp parallel for
for (size_t k = t; k < offset; k++) { for (size_t k = t; k < offset; k++) {
float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
...@@ -73,21 +76,24 @@ void Adam_Optimizer::Step_1(float* _params, ...@@ -73,21 +76,24 @@ void Adam_Optimizer::Step_1(float* _params,
grad = momentum / grad; grad = momentum / grad;
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; } if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param; param = grad * step_size + param;
#if defined(__ENABLE_CUDA__)
if (dev_params) _doubled_buffer[_buf_index][k - t] = param; if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
#endif
if (half_precision) if (half_precision)
params_cast_h[k] = (__half)param; params_cast_h[k] = (ds_half_precision_t)param;
else else
_params[k] = param; _params[k] = param;
_exp_avg[k] = momentum; _exp_avg[k] = momentum;
_exp_avg_sq[k] = variance; _exp_avg_sq[k] = variance;
} }
#if defined(__ENABLE_CUDA__)
if (dev_params) { if (dev_params) {
launch_param_update( launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index; _buf_index = !_buf_index;
} }
#endif
} }
} }
} }
...@@ -97,7 +103,7 @@ void Adam_Optimizer::Step_4(float* _params, ...@@ -97,7 +103,7 @@ void Adam_Optimizer::Step_4(float* _params,
float* _exp_avg, float* _exp_avg,
float* _exp_avg_sq, float* _exp_avg_sq,
size_t _param_size, size_t _param_size,
__half* dev_params, ds_half_precision_t* dev_params,
bool half_precision) bool half_precision)
{ {
size_t rounded_size = 0; size_t rounded_size = 0;
...@@ -166,7 +172,7 @@ void Adam_Optimizer::Step_8(float* _params, ...@@ -166,7 +172,7 @@ void Adam_Optimizer::Step_8(float* _params,
float* _exp_avg, float* _exp_avg,
float* _exp_avg_sq, float* _exp_avg_sq,
size_t _param_size, size_t _param_size,
__half* dev_params, ds_half_precision_t* dev_params,
bool half_precision) bool half_precision)
{ {
size_t rounded_size = 0; size_t rounded_size = 0;
...@@ -228,7 +234,9 @@ int ds_adam_step(int optimizer_id, ...@@ -228,7 +234,9 @@ int ds_adam_step(int optimizer_id,
nullptr, nullptr,
(params.options().dtype() == at::kHalf)); (params.options().dtype() == at::kHalf));
#if defined(__ENABLE_CUDA__)
opt->SynchronizeStreams(); opt->SynchronizeStreams();
#endif
return 0; return 0;
} }
...@@ -246,6 +254,7 @@ int ds_adam_step_plus_copy(int optimizer_id, ...@@ -246,6 +254,7 @@ int ds_adam_step_plus_copy(int optimizer_id,
torch::Tensor& exp_avg_sq, torch::Tensor& exp_avg_sq,
torch::Tensor& gpu_params) torch::Tensor& gpu_params)
{ {
#if defined(__ENABLE_CUDA__)
auto params_c = params.contiguous(); auto params_c = params.contiguous();
auto gpu_params_c = gpu_params.contiguous(); auto gpu_params_c = gpu_params.contiguous();
auto exp_avg_c = exp_avg.contiguous(); auto exp_avg_c = exp_avg.contiguous();
...@@ -254,7 +263,7 @@ int ds_adam_step_plus_copy(int optimizer_id, ...@@ -254,7 +263,7 @@ int ds_adam_step_plus_copy(int optimizer_id,
float* params_ptr = (float*)params_c.data_ptr(); float* params_ptr = (float*)params_c.data_ptr();
float* grads_ptr = (float*)grads_c.data_ptr(); float* grads_ptr = (float*)grads_c.data_ptr();
__half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr(); ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr();
float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();
...@@ -271,6 +280,9 @@ int ds_adam_step_plus_copy(int optimizer_id, ...@@ -271,6 +280,9 @@ int ds_adam_step_plus_copy(int optimizer_id,
(params.options().dtype() == at::kHalf)); (params.options().dtype() == at::kHalf));
opt->SynchronizeStreams(); opt->SynchronizeStreams();
#else
assert(false);
#endif
return 0; return 0;
} }
......
...@@ -12,7 +12,11 @@ ...@@ -12,7 +12,11 @@
#include <assert.h> #include <assert.h>
#ifdef __HIPCC__
#include "multi_tensor_apply_hip.cuh"
#else
#include "multi_tensor_apply.cuh" #include "multi_tensor_apply.cuh"
#endif
#include "type_shim.h" #include "type_shim.h"
#define BLOCK_SIZE 512 #define BLOCK_SIZE 512
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment