Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
#!/bin/bash
# Ensure an empty folder exists at the given path.
# If it already exists, delete its regular files (subdirectories are left
# alone, matching the original behavior); otherwise create it.
# Arguments: $1 - path of the folder to prepare
function prep_folder()
{
    local folder="$1"
    if [[ -d "${folder}" ]]; then
        # '--' guards against folder names starting with '-'; '-f' keeps the
        # command quiet when the folder is already empty.
        rm -f -- "${folder}"/*
    else
        mkdir -p -- "${folder}"
    fi
}
# Verify that the async I/O environment (libaio) is usable by running the
# validation script; exits the whole script with status 1 on failure.
function validate_environment()
{
    # Invoke directly instead of via eval — nothing here needs re-parsing.
    python ./validate_async_io.py
    local res=$?
    # Numeric comparison (-ne) instead of the original string compare (!=).
    if [[ ${res} -ne 0 ]]; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}
validate_environment

if [[ $# -ne 3 ]]; then
    # Fixed the original usage typo: "<write dir ><output log dir>".
    echo "Usage: $0 <write size in MB> <write dir> <output log dir>"
    exit 1
fi

SIZE="$1M"
WRITE_DIR="$2"
LOG_DIR="$3/aio_perf_sweep"
OUTPUT_FILE="${WRITE_DIR}/ds_aio_write_${SIZE}B.pt"
WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"

prep_folder "${WRITE_DIR}"
prep_folder "${LOG_DIR}"

RUN_SCRIPT=./test_ds_aio.py
DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"

# Sweep submission mode x overlap mode x threads x io_parallel x queue depth
# x block size; one log file per configuration.
for sub in single block; do
    if [[ "${sub}" == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ "${ov}" == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        # Drop the page cache so each run measures real device I/O.
                        echo "${DISABLE_CACHE}"
                        echo "${cmd}"
                        echo "${SYNC}"
                        eval "${DISABLE_CACHE}"
                        eval "${cmd}"
                        eval "${SYNC}"
                        sleep 2
                    done
                done
            done
        done
    done
done
{
"block_size": [
"128K",
"256K",
"1M"
],
"queue_depth": [
4,
16,
32
],
"io_parallel": [
1,
2,
4,
8
],
"single_submit": [
true,
false
],
"overlap_events": [
true,
false
],
"threads": [
1
]
}
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
import torch
import argparse
import time
import sys
from multiprocessing import Pool
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
from test_ds_aio_utils import refine_args
def parse_arguments():
    """Build and parse the command-line arguments for the AIO benchmark.

    Returns:
        argparse.Namespace: the parsed arguments (also echoed to stdout).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--read_file', type=str, default=None, help='Read file.')
    ap.add_argument('--write_file', type=str, default=None, help='Write file.')
    ap.add_argument('--write_size',
                    type=str,
                    default=None,
                    help='Number of bytes to write.')
    ap.add_argument('--block_size', type=str, default='1M', help='I/O block size.')
    ap.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
    ap.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')
    ap.add_argument(
        '--single_submit',
        action='store_true',
        help='Submit I/O requests in singles (default is submit queue_depth amount at once.).')
    ap.add_argument('--overlap_events',
                    action='store_true',
                    help='Overlap I/O submission and completion requests.')
    ap.add_argument('--validate', action='store_true', help='Perform validation in library.')
    ap.add_argument('--handle', action='store_true', help='Use AIO handle.')
    ap.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
    ap.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
    ap.add_argument('--gpu', action='store_true', help='Use GPU memory')

    parsed = ap.parse_args()
    print(f'args = {parsed}')
    return parsed
def validate_args(args):
    """Return False (printing an error) when a requested read file is missing."""
    missing_read_file = bool(args.read_file) and not os.path.isfile(args.read_file)
    if missing_read_file:
        print(f'args validation error: {args.read_file} not found')
        return False
    return True
def main():
    """Entry point: parse and normalize args, then dispatch the AIO benchmark."""
    print(f'Testing deepspeed_aio python frontend')

    cli_args = parse_arguments()
    refine_args(cli_args)
    if not validate_args(cli_args):
        quit()

    # Spawned workers get a clean interpreter (no inherited CUDA state).
    mp.set_start_method('spawn')
    if cli_args.handle:
        runner = aio_handle_multiprocessing
    else:
        runner = aio_basic_multiprocessing
    if cli_args.read_file:
        runner(cli_args, True)
    if cli_args.write_file:
        runner(cli_args, False)


if __name__ == "__main__":
    main()
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import os
# 1 GiB, used to express byte counts as GB/sec throughput figures.
BYTES_PER_GB = 1024**3
# Only these task ids produce log output.
LOG_TIDS = [0]


def task_log(tid, msg):
    """Print msg tagged with its task id, but only for whitelisted ids."""
    if tid not in LOG_TIDS:
        return
    print(f'tid {tid}: {msg}')
def task_barrier(barrier, num_parties):
    """Rendezvous on barrier, checking it expects exactly num_parties waiters.

    Asserts the barrier is still intact (not broken) after the wait.
    """
    expected = barrier.parties
    assert expected == num_parties
    barrier.wait()
    assert not barrier.broken
def report_results(args, read_op, pool_results):
    """Summarize per-process timing tuples and print latency/throughput.

    Each pool_results entry is (e2e_sec, task_sec, num_bytes); a None entry
    means one of the worker processes failed, in which case only a failure
    message is printed.
    """
    io_string = 'Read' if read_op else 'Write'
    if None in pool_results:
        print(f'Failure in one of {args.threads} {io_string} processes')
        return

    total_bytes = sum(entry[2] for entry in pool_results)

    task_latency_sec = max(entry[1] for entry in pool_results)
    task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
    print(f'Task {io_string} Latency = {task_latency_sec} sec')
    print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')

    e2e_latency_sec = max(entry[0] for entry in pool_results)
    e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
    print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
    print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')
def refine_integer_value(value):
    """Convert a size string with an optional K/M/G suffix into an int.

    Generalized to accept lower-case suffixes and surrounding whitespace
    ('1K' == '1k' == 1024); plain numeric strings convert directly.

    Raises:
        ValueError: if the numeric part is not a valid integer.
    """
    unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
    value = value.strip()
    suffix = value[-1].upper() if value else ''
    if suffix in unit_dict:
        return int(value[:-1]) * unit_dict[suffix]
    return int(value)
def refine_args(args):
    """Normalize string size arguments (e.g. '1M') to integer byte counts, in place."""
    if type(args.write_size) == str and args.write_size:
        args.write_size = refine_integer_value(args.write_size)
    if type(args.block_size) == str and args.block_size:
        args.block_size = refine_integer_value(args.block_size)
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
# Import-time sanity check: this script exits non-zero unless DeepSpeed's
# async I/O extension can be built/loaded in this environment (the calling
# shell script suggests installing libaio-dev on failure).
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
#include "custom_cuda_layers.h"
// Cast a float buffer elementwise to fp16: output[i] = (__half)input[i].
// One thread per element; threads with id >= size do nothing.
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < size) { output[id] = (__half)input[id]; }
}
// Launch param_update_kernel on `stream` with 1024-thread blocks and enough
// blocks to cover `size` elements.
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
// Copy two packed fp16 values per thread.
// NOTE(review): each 4-byte slot of `input` is reinterpreted bitwise as a
// packed __half2 pair, not numerically converted — this assumes the caller
// stages fp16 data inside the float buffer (half_precision path). Confirm
// against the staging-buffer writers.
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}
// Launch the packed-half copy kernel; `size` is halved because each thread
// moves a __half2 (two elements). NOTE(review): integer division assumes
// `size` is even — odd tails would drop an element; confirm callers.
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;
    size /= 2;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
// Cast a float buffer elementwise to fp16: output[i] = (__half)input[i].
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < size) { output[id] = (__half)input[id]; }
}

// Launch the elementwise cast kernel on `stream`, 1024 threads per block.
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}

// Copy two packed fp16 values per thread.
// NOTE(review): each 4-byte slot of `input` is reinterpreted bitwise as a
// packed __half2 pair rather than numerically converted — confirm the caller
// stages fp16 data in the float buffer on the half_precision path.
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}

// Launch the packed-half copy kernel; `size` is halved because each thread
// moves a __half2 (two elements). NOTE(review): assumes `size` is even.
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
    int threads = 1024;
    size /= 2;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
#pragma once
#ifdef _WIN32
#include <windows.h>
#else
#include <time.h>
#endif
#ifdef _WIN32
// Wall-clock stopwatch based on the Win32 high-resolution performance
// counter. Accumulates elapsed seconds across Start/Stop pairs until
// Reset/Restart.
class Stopwatch {
private:
    double m_total_time;         // accumulated seconds across Start/Stop pairs
    LARGE_INTEGER m_start_time;  // counter value captured at the last Start()

public:
    Stopwatch() { m_total_time = 0.0; }
    ~Stopwatch() {}

    // Clear the accumulated total (does not touch the start timestamp).
    void Reset() { m_total_time = 0.0; }

    // Begin (or resume) timing from now.
    void Start() { QueryPerformanceCounter(&m_start_time); }

    // Equivalent to Reset() followed by Start().
    void Restart()
    {
        m_total_time = 0.0;
        QueryPerformanceCounter(&m_start_time);
    }

    // Fold the time since the last Start() into the running total.
    void Stop()
    {
        LARGE_INTEGER frequency;
        LARGE_INTEGER stop_time;
        QueryPerformanceFrequency(&frequency);
        QueryPerformanceCounter(&stop_time);
        m_total_time +=
            ((double)(stop_time.QuadPart - m_start_time.QuadPart) / (double)frequency.QuadPart);
    }

    double GetTimeInSeconds() { return m_total_time; }
};
#else
// Wall-clock stopwatch built on CLOCK_MONOTONIC.
// Accumulates elapsed seconds across Start/Stop pairs; GetTimeInSeconds()
// may be called while running and keeps the stopwatch going.
class Stopwatch {
private:
    double accumulated_secs_;
    struct timespec started_at_;
    bool running_;

public:
    Stopwatch() : accumulated_secs_(0.0), running_(false) {}
    ~Stopwatch() {}

    // Drop any accumulated time.
    void Reset() { accumulated_secs_ = 0.0; }

    // Begin (or resume) timing from now.
    void Start()
    {
        clock_gettime(CLOCK_MONOTONIC, &started_at_);
        running_ = true;
    }

    // Equivalent to Reset() followed by Start().
    void Restart()
    {
        accumulated_secs_ = 0.0;
        clock_gettime(CLOCK_MONOTONIC, &started_at_);
        running_ = true;
    }

    // Fold the time since the last Start() into the total; no-op when idle.
    void Stop()
    {
        if (!running_) { return; }
        running_ = false;
        struct timespec now;
        clock_gettime(CLOCK_MONOTONIC, &now);
        accumulated_secs_ += (double)(now.tv_sec - started_at_.tv_sec) +
                             (double)(now.tv_nsec - started_at_.tv_nsec) / 1e9;
    }

    // Total accumulated seconds; a running stopwatch is sampled and resumed.
    double GetTimeInSeconds()
    {
        if (running_) {
            Stop();
            Start();
        }
        return accumulated_secs_;
    }
};
#endif
#ifndef __TIMER_H__
#define __TIMER_H__
#include <cuda_runtime.h>
#include <chrono>
#include "cuda.h"
// GPU-side timer built on CUDA events.
// Record() marks the start; Elapsed() records a stop event, synchronizes on
// it, and returns the elapsed time between the two events in milliseconds.
class GPUTimer {
    cudaEvent_t start, stop;

public:
    GPUTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~GPUTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    // Mark the start of the timed region.
    inline void Record() { cudaEventRecord(start); }
    // Milliseconds between Record() and now; blocks until the stop event
    // has completed.
    inline void Elapsed(float& time_elapsed)
    {
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time_elapsed, start, stop);
    }
};
// CPU-side millisecond timer based on std::chrono's high-resolution clock.
class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}

    // Restart the measurement window at "now".
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }

    // Milliseconds since construction/Reset/last Elapsed(); also restarts
    // the window, so consecutive calls measure consecutive intervals.
    inline float Elapsed()
    {
        auto previous = start;
        start = std::chrono::high_resolution_clock::now();
        auto micros =
            std::chrono::duration_cast<std::chrono::microseconds>(start - previous).count();
        return (float)(micros / 1e3);
    }
};
#endif
// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__
#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"
// GPU-side timer built on HIP events: Record() marks the start; Elapsed()
// records a stop event, synchronizes on it, and returns the elapsed time
// between the two events in milliseconds.
class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    // Mark the start of the timed region.
    inline void Record() { hipEventRecord(start); }
    // Blocks until the stop event completes.
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

// CPU-side millisecond timer; Elapsed() also restarts the measurement
// window, so consecutive calls measure consecutive intervals.
class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
                       1e3);
    }
};
#endif
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
// Older PyTorch exposed AT_CHECK instead of TORCH_CHECK; alias for
// source compatibility.
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

// Tensor::data() was renamed data_ptr() in PyTorch >= 1.3; pick the right
// accessor based on the build-time version flag.
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "gemm_test.h"
#define WARP_SIZE 32

// Abort (assert) on any CUDA runtime error returned by `callstr`.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

// Grid-stride 1D loop over n elements.
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

// Grid-stride 2D loop over an n x m index space.
#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                                                          \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
        for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)

#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144

// Blocks needed to cover N elements at DS_CUDA_NUM_THREADS per block,
// clamped to [1, DS_MAXIMUM_NUM_BLOCKS].
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}
// Process-wide singleton holding shared GPU resources for the transformer
// kernels: a cuRAND generator, a cuBLAS handle, a scratch workspace pointer,
// and an RNG seed/offset counter (presumably consumed by dropout kernels —
// verify against callers of IncrementOffset).
class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        cublasDestroy(_cublasHandle);
        cudaFree(_workspace);
    }

    // Meyers-singleton accessor.
    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    // Install an externally allocated scratch buffer (freed with cudaFree in
    // the destructor); null is rejected.
    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    curandGenerator_t& GetRandGenerator() { return _gen; }

    // Current PyTorch CUDA stream, so kernels serialize with framework work.
    cudaStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
        return stream;
    }

    cudaStream_t GetNewStream() { return at::cuda::getStreamFromPool(); }

    cublasHandle_t GetCublasHandle() { return _cublasHandle; }

    // Reserve offset_inc RNG offsets; returns (seed, offset-before-increment).
    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    // Choose cuBLAS GEMM algorithms for the five transformer GEMMs, either by
    // benchmarking (test_gemm) or by falling back to default algo id 99.
    // Results are cached in _gemm_algos; repeat calls are no-ops.
    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            cublasHandle_t handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            CUBLAS_OP_T,
                                            CUBLAS_OP_N,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            CUBLAS_OP_N,
                                            CUBLAS_OP_N,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    curandGenerator_t _gen;        // cuRAND generator (seeded with 123 above)
    cublasHandle_t _cublasHandle;
    void* _workspace;              // scratch buffer shared by kernels
    uint64_t _seed;                // RNG seed (default 42)
    uint64_t _curr_offset;         // running RNG offset
    std::vector<std::array<int, 3>> _gemm_algos;  // chosen GEMM algorithm ids
};
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32

// Abort (assert) on any HIP runtime error returned by `callstr`.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        hipError_t error_code = callstr;                                                       \
        if (error_code != hipSuccess) {                                                        \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

// Grid-stride 1D loop over n elements.
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

// Grid-stride 2D loop over an n x m index space.
#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                                                          \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
        for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)

#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144

// Blocks needed to cover N elements at DS_CUDA_NUM_THREADS per block,
// clamped to [1, DS_MAXIMUM_NUM_BLOCKS].
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}

// Process-wide singleton holding shared GPU resources (hipified mirror of
// the CUDA Context): a hipRAND generator, a rocBLAS handle, a scratch
// workspace pointer, and an RNG seed/offset counter.
class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        hipFree(_workspace);
    }

    // Meyers-singleton accessor.
    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    // Install an externally allocated scratch buffer (freed with hipFree in
    // the destructor); null is rejected.
    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    // Current PyTorch HIP stream, so kernels serialize with framework work.
    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    // Reserve offset_inc RNG offsets; returns (seed, offset-before-increment).
    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    // Choose GEMM algorithms for the five transformer GEMMs, by benchmarking
    // when test_gemm is set, otherwise default algo id 99. Results are
    // cached; repeat calls are no-ops.
    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;   // hipRAND generator (seeded with 123 above)
    rocblas_handle _cublasHandle;
    void* _workspace;          // scratch buffer shared by kernels
    uint64_t _seed;            // RNG seed (default 42)
    uint64_t _curr_offset;     // running RNG offset
    std::vector<std::array<int, 3>> _gemm_algos;  // chosen GEMM algorithm ids
};
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
// Declares Step_1 / Step_4 / Step_8: Adagrad CPU update over _param_size
// elements using a SIMD span of 1/4/8 vectors per iteration; when dev_param
// is non-null the updated parameters are also copied to that GPU buffer.
#define STEP(SPAN)                                \
    void Step_##SPAN(float* _params,              \
                     float* grads,                \
                     float* _exp_avg_sq,          \
                     size_t _param_size,          \
                     __half* dev_param = nullptr, \
                     bool half_precision = false);

// CPU Adagrad optimizer with optional AVX acceleration and double-buffered,
// pinned-host staging of updated parameters to the GPU.
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        // Pinned host buffers so async copies to dev_param can overlap compute.
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    // Wait for both parameter-copy streams to drain.
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
    }
    // Track the caller's step counter (Adagrad needs no bias correction).
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    // Refresh hyperparameters before the next step.
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;  // learning rate
    float _eps;    // denominator epsilon
    float _weight_decay;
    float _betta1_t;  // NOTE(review): not used in the visible Adagrad code
    float _betta2_t;  // NOTE(review): not used in the visible Adagrad code
    size_t _step;
    float* _doubled_buffer[2];  // pinned host staging buffers (double buffered)
    bool _buf_index;            // which staging buffer is currently in flight
    cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adagrad update over the SIMD-aligned prefix of the parameters.
// Processes ROUND_DOWN(_param_size, SIMD_WIDTH * span) elements in TILE-sized
// chunks and reports that count through *rounded_size; the scalar tail is
// left for the caller. When dev_params is set, updated parameters are staged
// in the pinned buffers and pushed to the GPU on alternating streams.
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    // Negative so the final FMA subtracts the scaled update from the params.
    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        // Reuse a staging buffer only after its previous device copy is done.
        if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            // NOTE(review): reloads `grads` (Adagrad has no momentum state),
            // but always as fp32 while the load above honors half_precision —
            // confirm this is intended on the fp16-gradient path.
            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, grads + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }

            // v += g^2 ; p += (-lr) * g / (sqrt(v) + eps)
            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
// Declares Step_1 / Step_4 / Step_8: Adagrad CPU update over _param_size
// elements using a SIMD span of 1/4/8 vectors per iteration; when dev_param
// is non-null the updated parameters are also copied to that GPU buffer.
#define STEP(SPAN)                                \
    void Step_##SPAN(float* _params,              \
                     float* grads,                \
                     float* _exp_avg_sq,          \
                     size_t _param_size,          \
                     __half* dev_param = nullptr, \
                     bool half_precision = false);

// CPU Adagrad optimizer (hipified mirror) with optional AVX acceleration
// and double-buffered, pinned-host staging of parameters to the GPU.
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        // Pinned host buffers so async copies to dev_param can overlap compute.
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    // Wait for both parameter-copy streams to drain.
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    // Track the caller's step counter (Adagrad needs no bias correction).
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    // Refresh hyperparameters before the next step.
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;  // learning rate
    float _eps;    // denominator epsilon
    float _weight_decay;
    float _betta1_t;  // NOTE(review): not used in the visible Adagrad code
    float _betta2_t;  // NOTE(review): not used in the visible Adagrad code
    size_t _step;
    float* _doubled_buffer[2];  // pinned host staging buffers (double buffered)
    bool _buf_index;            // which staging buffer is currently in flight
    hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adagrad update over the SIMD-aligned prefix of the parameters.
// Processes ROUND_DOWN(_param_size, SIMD_WIDTH * span) elements in TILE-sized
// chunks and reports that count through *rounded_size; the scalar tail is
// left for the caller. When dev_params is set, updated parameters are staged
// in the pinned buffers and pushed to the GPU on alternating streams.
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    // Negative so the final FMA subtracts the scaled update from the params.
    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        // Reuse a staging buffer only after its previous device copy is done.
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            // NOTE(review): reloads `grads` (Adagrad has no momentum state),
            // but always as fp32 while the load above honors half_precision —
            // confirm this is intended on the fp16-gradient path.
            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, grads + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }

            // v += g^2 ; p += (-lr) * g / (sqrt(v) + eps)
            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
// Declares Step_1 / Step_4 / Step_8: Adam CPU update with first (exp_avg)
// and second (exp_avg_sq) moment buffers; when dev_param is non-null the
// updated parameters are also copied to that GPU buffer.
#define STEP(SPAN)                                \
    void Step_##SPAN(float* _params,              \
                     float* grads,                \
                     float* _exp_avg,             \
                     float* _exp_avg_sq,          \
                     size_t _param_size,          \
                     __half* dev_param = nullptr, \
                     bool half_precision = false);

// CPU Adam/AdamW optimizer with optional AVX acceleration and
// double-buffered, pinned-host staging of updated parameters to the GPU.
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        // Pinned host buffers so async copies to dev_param can overlap compute.
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    // Wait for both parameter-copy streams to drain.
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
    }
    // Advance the step counter while keeping the cached beta powers
    // (_betta1_t = betta1^step, _betta2_t = betta2^step) in sync: the powers
    // are recomputed from scratch when the betas change or the step number
    // jumps, and updated incrementally on the sequential fast path.
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    // Refresh hyperparameters and, when requested, the bias-correction
    // terms used by the next step (1 - beta1^t and 1/sqrt(1 - beta2^t)).
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
    float _alpha;   // learning rate
    float _betta1;  // first-moment decay rate
    float _betta2;  // second-moment decay rate
    float _eps;
    float _weight_decay;
    float _betta1_t;  // betta1^step (cached)
    float _betta2_t;  // betta2^step (cached)
    size_t _step;
    float _bias_correction1;    // 1 - betta1^t (or 1 when disabled)
    float _bias_correction2;    // 1/sqrt(1 - betta2^t) (or 1 when disabled)
    float* _doubled_buffer[2];  // pinned host staging buffers (double buffered)
    bool _buf_index;            // which staging buffer is currently in flight
    bool _adamw_mode;           // true: decoupled (AdamW-style) weight decay
    cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adam/AdamW update (AVX-512 or AVX-256, chosen at build time).
// Processes the largest prefix of _param_size that is a multiple of
// SIMD_WIDTH * span and reports that length through *rounded_size; the
// caller is expected to finish the tail with the scalar path.
//
// _params, _exp_avg and _exp_avg_sq are updated in place; the grads memory
// is only read (gradient math happens in registers). When dev_params is
// non-null, each TILE of updated parameters is staged into a pinned host
// buffer and asynchronously copied to the device as __half. half_precision
// selects __half layout for the _params/grads host buffers.
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
// Broadcast all hyper-parameters into SIMD lanes once, outside the loop.
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
// _bias_correction2 already holds 1/sqrt(1 - beta2^t).
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
// Negative so the final FMA subtracts the scaled update.
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
// AdamW uses -alpha*wd (applied to params); classic mode uses raw wd
// (folded into the gradient).
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
// Double buffering: from the third tile on, wait for the async copy
// that last used this pinned buffer before overwriting it.
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
// Classic (L2) decay: fold wd * param into the gradient.
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
// m = beta1 * m + (1 - beta1) * g
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
// v = beta2 * v + (1 - beta2) * g^2  (grad_4 reused as scratch)
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
// update = m / (sqrt(v) * bias_correction2 + eps)
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
// AdamW: decoupled decay applied directly to the parameters
// (weight_decay4 holds -alpha * wd in this mode).
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
// param += (-alpha / bias_correction1) * update
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
// Stage the updated values for the async device copy below.
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
// Kick off the async host->device copy of this tile, then swap
// pinned buffers so the next tile can proceed without waiting.
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
// Declares a scalar Step_<SPAN> entry point for the optimizer. All spans
// share one signature; dev_param, when non-null, designates a device-side
// __half destination for the updated parameters, and half_precision marks
// _params/grads as __half-laid-out buffers.
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
// CPU-side Adam/AdamW optimizer (HIP-hipified build). Holds the
// hyper-parameter state, cached beta powers, two pinned host staging
// buffers and two HIP streams used to pipeline parameter copies to the
// device.
class Adam_Optimizer {
public:
Adam_Optimizer(float alpha = 1e-3,
float betta1 = 0.9,
float betta2 = 0.999,
float eps = 1e-8,
float weight_decay = 0,
bool adamw_mode = true)
: _alpha(alpha),
_betta1(betta1),
_betta2(betta2),
_eps(eps),
_weight_decay(weight_decay),
_betta1_t(1.0),
_betta2_t(1.0),
_step(0),
_buf_index(false),
_adamw_mode(adamw_mode)
{
// Two pinned host buffers of TILE floats for double-buffered
// host->device copies.
// NOTE(review): hipHostMalloc return codes are not checked — a failed
// allocation would surface later as a crash. Consider verifying.
hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));
_streams[0] = Context::Instance().GetCurrentStream();
_streams[1] = Context::Instance().GetNewStream();
}
~Adam_Optimizer()
{
// Release the pinned staging buffers; streams are owned by Context.
hipHostFree(_doubled_buffer[0]);
hipHostFree(_doubled_buffer[1]);
}
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized step over the SIMD-aligned prefix; *rounded_size reports
// how many elements were handled (tail goes to the scalar Step_*).
template <int span>
void Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t param_size,
__half* dev_param = nullptr,
bool half_precision = false);
#endif
// Scalar entry points Step_1 / Step_4 / Step_8 (see the STEP macro).
STEP(1)
STEP(4)
STEP(8)
// Block the host until both copy streams have drained.
inline void SynchronizeStreams()
{
for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
}
// Advance the step counter and keep the cached beta powers
// (_betta1_t = beta1^step, _betta2_t = beta2^step) consistent.
inline void IncrementStep(size_t step, float beta1, float beta2)
{
if (beta1 != _betta1 || beta2 != _betta2) {
// Betas changed: recompute the powers from scratch.
_step = step;
_betta1 = beta1;
_betta2 = beta2;
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
} else {
_step++;
if (_step != step) {
// Non-consecutive step: recompute exactly and resync.
_betta1_t = std::pow(_betta1, step);
_betta2_t = std::pow(_betta2, step);
_step = step;
} else {
// Consecutive step: cheap incremental update.
_betta1_t *= _betta1;
_betta2_t *= _betta2;
}
}
}
// Refresh per-step hyper-parameters; when bias_correction is set, derive
// the Adam bias-correction factors from the cached beta powers
// (_bias_correction2 is stored pre-inverted for the hot loop).
inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
{
_alpha = lr;
_eps = epsilon;
_weight_decay = weight_decay;
_bias_correction1 = 1.0f;
_bias_correction2 = 1.0f;
if (bias_correction == 1) {
_bias_correction1 = 1 - _betta1_t;
_bias_correction2 = 1 / sqrt(1 - _betta2_t);
}
}
private:
float _alpha;  // learning rate
float _betta1;  // Adam beta1
float _betta2;  // Adam beta2
float _eps;  // denominator stabilizer
float _weight_decay;  // decay coefficient (0 disables decay)
float _betta1_t;  // cached beta1^step
float _betta2_t;  // cached beta2^step
size_t _step;  // current optimizer step
float _bias_correction1;  // 1 - beta1^step (or 1 when correction is off)
float _bias_correction2;  // 1/sqrt(1 - beta2^step) (or 1 when correction is off)
float* _doubled_buffer[2];  // pinned host staging buffers
bool _buf_index;  // toggles 0/1 to double-buffer _doubled_buffer
bool _adamw_mode;  // true: decoupled (AdamW) decay; false: classic L2
hipStream_t _streams[2];  // streams for the async parameter copies
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adam/AdamW update (AVX-512 or AVX-256), HIP-hipified variant.
// Identical math to the CUDA version; only the stream-sync call differs.
// Processes the largest prefix of _param_size that is a multiple of
// SIMD_WIDTH * span and reports it through *rounded_size; the caller
// finishes the tail with the scalar path. _params/_exp_avg/_exp_avg_sq are
// updated in place; grads memory is only read. When dev_params is non-null,
// each TILE of updated parameters is staged into a pinned host buffer and
// asynchronously copied to the device as __half.
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
float* _params,
float* grads,
float* _exp_avg,
float* _exp_avg_sq,
size_t _param_size,
__half* dev_params,
bool half_precision)
{
size_t new_rounded_size = 0;
// Broadcast hyper-parameters into SIMD lanes once, outside the loop.
AVX_Data betta1_4;
betta1_4.data = SIMD_SET(_betta1);
AVX_Data betta2_4;
betta2_4.data = SIMD_SET(_betta2);
float betta1_minus1 = 1 - _betta1;
float betta2_minus1 = 1 - _betta2;
AVX_Data betta1_minus1_4;
betta1_minus1_4.data = SIMD_SET(betta1_minus1);
AVX_Data betta2_minus1_4;
betta2_minus1_4.data = SIMD_SET(betta2_minus1);
// _bias_correction2 already holds 1/sqrt(1 - beta2^t).
AVX_Data bias2_sqrt;
bias2_sqrt.data = SIMD_SET(_bias_correction2);
AVX_Data eps_4;
eps_4.data = SIMD_SET(_eps);
// Negative so the final FMA subtracts the scaled update.
float step_size = -1 * _alpha / _bias_correction1;
AVX_Data step_size_4;
step_size_4.data = SIMD_SET(step_size);
// AdamW uses -alpha*wd (applied to params); classic mode uses raw wd.
float w_decay = -1 * _alpha * _weight_decay;
AVX_Data weight_decay4;
if (_weight_decay > 0)
weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
for (size_t t = 0; t < new_rounded_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
size_t offset = copy_size + t;
// Double buffering: from the third tile on, wait for the async copy
// that last used this pinned buffer before overwriting it.
if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
AVX_Data grad_4[span];
simd_load<span>(grad_4, grads + i, half_precision);
AVX_Data momentum_4[span];
simd_load<span>(momentum_4, _exp_avg + i, false);
AVX_Data variance_4[span];
simd_load<span>(variance_4, _exp_avg_sq + i, false);
AVX_Data param_4[span];
simd_load<span>(param_4, _params + i, half_precision);
// Classic (L2) decay: fold wd * param into the gradient.
if (_weight_decay > 0 && !_adamw_mode) {
simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
}
// m = beta1 * m + (1 - beta1) * g
simd_mul<span>(momentum_4, momentum_4, betta1_4);
simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
// v = beta2 * v + (1 - beta2) * g^2  (grad_4 reused as scratch)
simd_mul<span>(variance_4, variance_4, betta2_4);
simd_mul<span>(grad_4, grad_4, grad_4);
simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
// update = m / (sqrt(v) * bias_correction2 + eps)
simd_sqrt<span>(grad_4, variance_4);
simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
simd_div<span>(grad_4, momentum_4, grad_4);
// AdamW: decoupled decay applied directly to the parameters.
if (_weight_decay > 0 && _adamw_mode) {
simd_fma<span>(param_4, param_4, weight_decay4, param_4);
}
// param += (-alpha / bias_correction1) * update
simd_fma<span>(param_4, grad_4, step_size_4, param_4);
simd_store<span>(_params + i, param_4, half_precision);
if (dev_params) {
// Stage the updated values for the async device copy below.
simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
}
simd_store<span>(_exp_avg + i, momentum_4, false);
simd_store<span>(_exp_avg_sq + i, variance_4, false);
}
if (dev_params) {
// Launch the async host->device copy of this tile, then swap
// pinned buffers so the next tile proceeds without waiting.
if (half_precision)
launch_param_update_half(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
else
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
*rounded_size = new_rounded_size;
}
#endif
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
// GEMM wrapper declarations. C = alpha * op(A) * op(B) + beta * C, with the
// algorithm defaulting per platform (rocBLAS standard vs cuBLAS default /
// tensor-op). NOTE(review): alpha/beta precede the matrix pointers here,
// unlike the raw cuBLAS API — callers must match this argument order.
// fp32 GEMM.
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 GEMM (tensor-op default on CUDA).
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// fp32 strided-batched GEMM over 'batch' matrices; stride_* are the element
// offsets between consecutive matrices in each operand.
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 strided-batched GEMM (tensor-op default on CUDA).
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
// HIP-hipified GEMM wrapper declarations (rocBLAS handles/enums).
// NOTE(review): the #else branches still reference cublasGemmAlgo_t, which
// this rocBLAS-only header does not declare — only the __HIP_PLATFORM_HCC__
// branch is expected to compile here; confirm the build always defines it.
// fp32 GEMM: C = alpha * op(A) * op(B) + beta * C.
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 GEMM.
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// fp32 strided-batched GEMM over 'batch' matrices.
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// fp16 strided-batched GEMM.
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
#ifdef __HIP_PLATFORM_HCC__
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
// Evaluate a CUDA call, log the numeric error code with file/line to
// std::cerr and abort via assert on failure.
// NOTE(review): relies on <iostream> being pulled in transitively — it is
// not included above; confirm.
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
// Kernel launch tuning constants.
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8  // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
// ---- Quantization kernel launchers ----------------------------------------
// All quantize variants operate on 'vals' in place: total_count elements
// split into group_num groups, quantized to num_bits.
// (NOTE(review): presumably symmetric quantization — the *_asym variants
// below cover the asymmetric case; confirm against the kernel source.)
template <typename T>
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Stochastic-rounding variant.
template <typename T>
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Asymmetric quantization.
template <typename T>
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Asymmetric quantization with stochastic rounding.
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
const T* bias,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream);
// Plain gelu activation.
template <typename T>
void launch_gelu(const T* input,
T* output,
int intermediate_size,
int batch_size,
cudaStream_t stream);
// Gelu backward: writes the gradient into d_output.
template <typename T>
void launch_d_gelu(T* d_output,
const T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
// Custom fused bias add with layer normalization
// Forward variant that also saves per-row variances and means for backward.
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
T* vars,
T* means);
// Overload that saves only the variances.
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
const T* residual,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream,
bool preLayerNorm,
bool training,
T* vars);
// LayerNorm backward fused with the addition of a second incoming gradient
// (out_grad1 + out_grad2); uses saved means/vars and the original input.
// Note: takes two streams so gamma/beta and input gradients can overlap.
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2]);
// Fused-add backward from normalized outputs (vals_hat); 'invertible'
// recovers the input using betta instead of saved means.
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
const T* out_grad2,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
// LayerNorm backward from the original input and saved means/vars.
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* X_data,
const T* vars,
const T* means,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2]);
// LayerNorm backward from normalized outputs (vals_hat); see above for the
// 'invertible'/'betta' pair.
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
const T* vals_hat,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2],
bool invertible = false,
const T* betta = nullptr);
// Non-reversible backward variant taking pre-transposed copies of the
// gradient and values.
template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
const T* vals,
const T* out_grad_trans,
const T* vals_trans,
const T* means,
const T* vars,
const T* gamma,
T* gamma_grad,
T* betta_grad,
T* inp_grad,
int batch_size,
int hidden_dim,
cudaStream_t stream[2]);
// 2D matrix transpose: out_mat = inp_mat^T for a rows x cols matrix.
template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, cudaStream_t stream);
// Softmax backward over attention scores; writes the gradient into out_grad.
template <typename T>
void launch_attn_softmax_backward(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
cudaStream_t stream);
// v2 of the softmax backward (alternate kernel implementation).
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
const T* soft_inp,
int batch_size,
int heads,
int seq_length,
cudaStream_t stream);
// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
const T* attn_mask,
int batch_size,
int heads,
int sequence_length,
cudaStream_t stream);
// Dimension permutation [0, 1, 2, 3] -> [0, 2, 1, 3] over
// (batch, seq, heads, head_dim)-style layouts.
template <typename T>
void launch_transform_0213(T* output,
const T* vals,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream);
// Custom bias add
// Fused with the 0213 transform; trans_count selects how many stacked
// tensors (e.g. q/k/v) are processed.
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
const T* vals,
const T* bias,
int batch_size,
int seq_length,
int hidden_dim,
int heads,
cudaStream_t stream,
int trans_count);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
// ---- Dropout launchers ----------------------------------------------------
// In-place dropout fused with bias add; mask records the kept/dropped
// pattern for backward.
template <typename T>
void launch_dropout(T* vals,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
cudaStream_t stream);
// Out-of-place dropout; bwd reuses an existing mask instead of sampling.
template <typename T>
void launch_dropout(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
int dim,
float ratio,
cudaStream_t stream,
bool bwd = false);
// Dropout fused with residual + bias addition.
template <typename T>
void launch_dropout(T* out,
const T* vals,
const T* residual,
const T* bias,
uint8_t* mask,
int batch,
int dim,
float ratio,
cudaStream_t stream);
// In-place dropout backward using the saved mask.
template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, cudaStream_t stream);
// Out-of-place dropout backward.
template <typename T>
void launch_dropout_grad(T* vals_out,
const T* vals,
uint8_t* mask,
int total_count,
float ratio,
cudaStream_t stream);
// Fused transpose + bias reduction (e.g. bias gradient accumulation).
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
T* out,
int rows,
int cols,
cudaStream_t stream);
// Async copy of fp32 host parameters to a device-side __half buffer; the
// _half variant reads the staged source as packed __half data.
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment