Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
deepspeed
Commits
67ea635f
Commit
67ea635f
authored
Mar 30, 2023
by
aiss
Browse files
push dsv0.8.2 version
parent
1b2721ad
Pipeline
#201
failed with stages
in 0 seconds
Changes
341
Pipelines
2
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2140 deletions
+0
-2140
deepspeed/ops/csrc/aio/py_test/run_write_sweep.sh
deepspeed/ops/csrc/aio/py_test/run_write_sweep.sh
+0
-82
deepspeed/ops/csrc/aio/py_test/single_process_config.json
deepspeed/ops/csrc/aio/py_test/single_process_config.json
+0
-29
deepspeed/ops/csrc/aio/py_test/test_ds_aio.py
deepspeed/ops/csrc/aio/py_test/test_ds_aio.py
+0
-101
deepspeed/ops/csrc/aio/py_test/test_ds_aio_utils.py
deepspeed/ops/csrc/aio/py_test/test_ds_aio_utils.py
+0
-59
deepspeed/ops/csrc/aio/py_test/validate_async_io.py
deepspeed/ops/csrc/aio/py_test/validate_async_io.py
+0
-9
deepspeed/ops/csrc/common/custom_cuda_kernel.cu
deepspeed/ops/csrc/common/custom_cuda_kernel.cu
+0
-39
deepspeed/ops/csrc/common/custom_hip_kernel.hip
deepspeed/ops/csrc/common/custom_hip_kernel.hip
+0
-41
deepspeed/ops/csrc/includes/StopWatch.h
deepspeed/ops/csrc/includes/StopWatch.h
+0
-98
deepspeed/ops/csrc/includes/Timer.h
deepspeed/ops/csrc/includes/Timer.h
+0
-47
deepspeed/ops/csrc/includes/Timer_hip.h
deepspeed/ops/csrc/includes/Timer_hip.h
+0
-48
deepspeed/ops/csrc/includes/compat.h
deepspeed/ops/csrc/includes/compat.h
+0
-14
deepspeed/ops/csrc/includes/context.h
deepspeed/ops/csrc/includes/context.h
+0
-171
deepspeed/ops/csrc/includes/context_hip.h
deepspeed/ops/csrc/includes/context_hip.h
+0
-172
deepspeed/ops/csrc/includes/cpu_adagrad.h
deepspeed/ops/csrc/includes/cpu_adagrad.h
+0
-150
deepspeed/ops/csrc/includes/cpu_adagrad_hip.h
deepspeed/ops/csrc/includes/cpu_adagrad_hip.h
+0
-151
deepspeed/ops/csrc/includes/cpu_adam.h
deepspeed/ops/csrc/includes/cpu_adam.h
+0
-225
deepspeed/ops/csrc/includes/cpu_adam_hip.h
deepspeed/ops/csrc/includes/cpu_adam_hip.h
+0
-226
deepspeed/ops/csrc/includes/cublas_wrappers.h
deepspeed/ops/csrc/includes/cublas_wrappers.h
+0
-87
deepspeed/ops/csrc/includes/cublas_wrappers_hip.h
deepspeed/ops/csrc/includes/cublas_wrappers_hip.h
+0
-88
deepspeed/ops/csrc/includes/custom_cuda_layers.h
deepspeed/ops/csrc/includes/custom_cuda_layers.h
+0
-303
No files found.
Too many changes to show.
To preserve performance only
341 of 341+
files are displayed.
Plain diff
Email patch
deepspeed/ops/csrc/aio/py_test/run_write_sweep.sh
deleted
100644 → 0
View file @
1b2721ad
#!/bin/bash

# Prepare ${folder}: empty it if it already exists, otherwise create it.
function prep_folder()
{
    folder=$1
    if [[ -d ${folder} ]]; then
        rm -f ${folder}/*
    else
        mkdir -p ${folder}
    fi
}

# Fail fast if async I/O support is not available (libaio must be installed).
function validate_environment()
{
    validate_cmd="python ./validate_async_io.py"
    eval ${validate_cmd}
    res=$?
    if [[ $res != 0 ]]; then
        echo "Failing because environment is not properly configured"
        echo "Possible fix: sudo apt-get install libaio-dev"
        exit 1
    fi
}

validate_environment

if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <write size in MB> <write dir ><output log dir>"
    exit 1
fi

SIZE="$1M"
WRITE_DIR=$2
LOG_DIR=$3/aio_perf_sweep

OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt
WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"

prep_folder ${WRITE_DIR}
prep_folder ${LOG_DIR}

RUN_SCRIPT=./test_ds_aio.py

# Drop the page cache before each run so every measurement hits the device.
# NOTE(review): requires passwordless sudo — confirm for CI environments.
DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"

# Sweep submission mode x event overlap x threads x parallelism x queue depth
# x block size; one log file per configuration.
for sub in single block; do
    if [[ $sub == "single" ]]; then
        sub_opt="--single_submit"
    else
        sub_opt=""
    fi
    for ov in overlap sequential; do
        if [[ $ov == "overlap" ]]; then
            ov_opt="--overlap_events"
        else
            ov_opt=""
        fi
        for t in 1 2 4 8; do
            for p in 1; do
                for d in 1 2 4 8 16 32; do
                    for bs in 128K 256K 512K 1M; do
                        SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
                        OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
                        LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
                        cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
                        echo ${DISABLE_CACHE}
                        echo ${cmd}
                        echo ${SYNC}
                        eval ${DISABLE_CACHE}
                        eval ${cmd}
                        eval ${SYNC}
                        # Small pause so device queues fully drain between runs.
                        sleep 2
                    done
                done
            done
        done
    done
done
deepspeed/ops/csrc/aio/py_test/single_process_config.json
deleted
100644 → 0
View file @
1b2721ad
{
"block_size"
:
[
"128K"
,
"256K"
,
"1M"
],
"queue_depth"
:
[
4
,
16
,
32
],
"io_parallel"
:
[
1
,
2
,
4
,
8
],
"single_submit"
:
[
true
,
false
],
"overlap_events"
:
[
true
,
false
],
"threads"
:
[
1
]
}
deepspeed/ops/csrc/aio/py_test/test_ds_aio.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
import
torch
import
argparse
import
time
import
sys
from
multiprocessing
import
Pool
import
multiprocessing
as
mp
from
ds_aio_basic
import
aio_basic_multiprocessing
from
ds_aio_handle
import
aio_handle_multiprocessing
from
test_ds_aio_utils
import
refine_args
def parse_arguments():
    """Define and parse the command-line options for the aio benchmark driver.

    Returns the parsed argparse.Namespace after echoing it to stdout.
    """
    cli = argparse.ArgumentParser()

    # File/IO targets and sizes (size strings like '1M' are refined later).
    cli.add_argument('--read_file', type=str, default=None, help='Read file.')
    cli.add_argument('--write_file', type=str, default=None, help='Write file.')
    cli.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.')
    cli.add_argument('--block_size', type=str, default='1M', help='I/O block size.')

    # Submission/scheduling knobs.
    cli.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
    cli.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')
    cli.add_argument(
        '--single_submit',
        action='store_true',
        help='Submit I/O requests in singles (default is submit queue_depth amount at once.).')
    cli.add_argument('--overlap_events',
                     action='store_true',
                     help='Overlap I/O submission and completion requests.')

    # Mode switches.
    cli.add_argument('--validate', action='store_true', help='Perform validation in library.')
    cli.add_argument('--handle', action='store_true', help='Use AIO handle.')
    cli.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
    cli.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
    cli.add_argument('--gpu', action='store_true', help='Use GPU memory')

    args = cli.parse_args()
    print(f'args = {args}')
    return args
def validate_args(args):
    """Return True when the arguments are usable.

    The only check performed: if a read file was requested, it must exist on
    disk; otherwise an error is printed and False is returned.
    """
    read_target = args.read_file
    if read_target and not os.path.isfile(read_target):
        print(f'args validation error: {read_target} not found')
        return False

    return True
def main():
    """Entry point: parse, refine, and validate args, then run the benchmark
    via multiprocessing in either handle or basic aio mode."""
    print(f'Testing deepspeed_aio python frontend')

    args = parse_arguments()
    refine_args(args)
    if not validate_args(args):
        quit()

    # NOTE(review): 'spawn' gives each worker a fresh interpreter —
    # presumably required for CUDA-safe child processes; confirm.
    mp.set_start_method('spawn')
    multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
    # Read and write passes are independent; either or both may run.
    if args.read_file:
        multiprocess_function(args, True)

    if args.write_file:
        multiprocess_function(args, False)


if __name__ == "__main__":
    main()
deepspeed/ops/csrc/aio/py_test/test_ds_aio_utils.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2020 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
os
BYTES_PER_GB
=
1024
**
3
LOG_TIDS
=
[
0
]
def task_log(tid, msg):
    """Print msg tagged with its task id, but only for ids in LOG_TIDS."""
    if tid not in LOG_TIDS:
        return
    print(f'tid {tid}: {msg}')
def task_barrier(barrier, num_parties):
    """Wait on a barrier expected to coordinate exactly num_parties tasks.

    Asserts the barrier's configured party count before waiting and that the
    barrier survived (was not broken) afterwards.
    """
    assert num_parties == barrier.parties
    barrier.wait()
    assert barrier.broken == False
def report_results(args, read_op, pool_results):
    """Print latency and throughput summaries for a pool of aio workers.

    Each entry of pool_results is a 3-tuple; the first element is used as the
    end-to-end seconds, the second as task seconds, the third as byte count.
    A None entry means that worker failed and aborts reporting.
    """
    #print(f'pool_results = {pool_results}')
    io_string = 'Read' if read_op else 'Write'

    if None in pool_results:
        print(f'Failure in one of {args.threads} {io_string} processes')
        return

    total_bytes = sum([num_bytes for _, _, num_bytes in pool_results])

    # Task-level numbers: slowest worker bounds the aggregate latency.
    task_latency_sec = max([sec for _, sec, _ in pool_results])
    task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
    print(f'Task {io_string} Latency = {task_latency_sec} sec')
    print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')

    # End-to-end numbers (includes per-worker setup/teardown).
    e2e_latency_sec = max([sec for sec, _, _ in pool_results])
    e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
    print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
    print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')
def refine_integer_value(value):
    """Convert a size string with an optional K/M/G suffix to an integer.

    Generalized over the original: unit suffixes are accepted in either case
    ('1k' == '1K') and surrounding whitespace is tolerated. A plain numeric
    string (or int) is converted with int() unchanged.

    Raises ValueError for non-numeric input (the original raised for the same
    inputs, sometimes as IndexError on the empty string).
    """
    unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}

    text = str(value).strip()
    # Check the last character case-insensitively against the unit table.
    if text and text[-1].upper() in unit_dict:
        return int(text[:-1]) * unit_dict[text[-1].upper()]
    return int(text)
def refine_args(args):
    """Normalize string-valued size fields ('1M', '128K', ...) into integer
    byte counts, mutating args in place."""
    for field in ('write_size', 'block_size'):
        current = getattr(args, field)
        if current and type(current) == str:
            setattr(args, field, refine_integer_value(current))
deepspeed/ops/csrc/aio/py_test/validate_async_io.py
deleted
100644 → 0
View file @
1b2721ad
"""
Copyright 2021 The Microsoft DeepSpeed Team
Licensed under the MIT license.
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
import
deepspeed
from
deepspeed.ops.aio
import
AsyncIOBuilder
assert
AsyncIOBuilder
().
is_compatible
()
deepspeed/ops/csrc/common/custom_cuda_kernel.cu
deleted
100644 → 0
View file @
1b2721ad
#include "custom_cuda_layers.h"
// Elementwise float -> __half conversion: one thread per element, with a
// bounds check so any grid size that covers `size` is valid.
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    if (id < size) { output[id] = (__half)input[id]; }
}
// Host launcher for param_update_kernel: converts `size` floats in `input`
// to halves in `output` asynchronously on `stream`.
// Uses 1024-thread blocks and a ceil-div grid.
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
// Copies packed half data: each 32-bit word of `input` is bit-reinterpreted
// as a __half2 pair and stored to `output` (viewed as __half2). Here `size`
// counts __half2 elements, i.e. half the number of __half values.
// NOTE(review): this assumes the float buffer holds bit-packed halves rather
// than float values — confirm against the CPU-side producer (simd_store with
// half_precision=true).
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        // Load one 32-bit word, then reinterpret its bits as two halves.
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}
// Host launcher for the packed-half copy. `size` arrives as a count of
// __half values; it is halved because the kernel moves one __half2 (two
// halves) per thread. NOTE(review): an odd `size` would drop the last
// element — callers presumably guarantee even sizes; confirm.
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;
    size /= 2;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}
deepspeed/ops/csrc/common/custom_hip_kernel.hip
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
// Elementwise float -> __half conversion (HIP port); one thread per element
// with a bounds check.
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < size) { output[id] = (__half)input[id]; }
}
// Host launcher (HIP): 1024-thread blocks, ceil-div grid, async on `stream`.
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
    int threads = 1024;
    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);
    hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
// Packed-half copy (HIP port): each 32-bit word of `input` is reinterpreted
// as a __half2 and stored to `output` viewed as __half2. `size` counts
// __half2 elements. NOTE(review): assumes the float buffer holds bit-packed
// halves, not float values — confirm against the producer.
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}
// Host launcher (HIP) for the packed-half copy; halves `size` because the
// kernel moves two __half values per thread.
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
    int threads = 1024;
    size /= 2;
    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);
    hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
deepspeed/ops/csrc/includes/StopWatch.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#ifdef _WIN32
#include <windows.h>
#else
#include <time.h>
#endif
#ifdef _WIN32
class
Stopwatch
{
private:
double
m_total_time
;
LARGE_INTEGER
m_start_time
;
public:
Stopwatch
()
{
m_total_time
=
0.0
;
}
~
Stopwatch
()
{}
void
Reset
()
{
m_total_time
=
0.0
;
}
void
Start
()
{
QueryPerformanceCounter
(
&
m_start_time
);
}
void
Restart
()
{
m_total_time
=
0.0
;
QueryPerformanceCounter
(
&
m_start_time
);
}
void
Stop
()
{
LARGE_INTEGER
frequency
;
LARGE_INTEGER
stop_time
;
QueryPerformanceFrequency
(
&
frequency
);
QueryPerformanceCounter
(
&
stop_time
);
m_total_time
+=
((
double
)(
stop_time
.
QuadPart
-
m_start_time
.
QuadPart
)
/
(
double
)
frequency
.
QuadPart
);
}
double
GetTimeInSeconds
()
{
return
m_total_time
;
}
};
#else
class
Stopwatch
{
private:
double
m_total_time
;
struct
timespec
m_start_time
;
bool
m_is_started
;
public:
Stopwatch
()
{
m_total_time
=
0.0
;
m_is_started
=
false
;
}
~
Stopwatch
()
{}
void
Reset
()
{
m_total_time
=
0.0
;
}
void
Start
()
{
clock_gettime
(
CLOCK_MONOTONIC
,
&
m_start_time
);
m_is_started
=
true
;
}
void
Restart
()
{
m_total_time
=
0.0
;
clock_gettime
(
CLOCK_MONOTONIC
,
&
m_start_time
);
m_is_started
=
true
;
}
void
Stop
()
{
if
(
m_is_started
)
{
m_is_started
=
false
;
struct
timespec
end_time
;
clock_gettime
(
CLOCK_MONOTONIC
,
&
end_time
);
m_total_time
+=
(
double
)(
end_time
.
tv_sec
-
m_start_time
.
tv_sec
)
+
(
double
)(
end_time
.
tv_nsec
-
m_start_time
.
tv_nsec
)
/
1e9
;
}
}
double
GetTimeInSeconds
()
{
if
(
m_is_started
)
{
Stop
();
Start
();
}
return
m_total_time
;
}
};
#endif
deepspeed/ops/csrc/includes/Timer.h
deleted
100644 → 0
View file @
1b2721ad
#ifndef __TIMER_H__
#define __TIMER_H__

#include <cuda_runtime.h>
#include <chrono>
#include "cuda.h"

// GPU interval timer based on CUDA events recorded on the default stream.
// Usage: Record() to mark the start, Elapsed(ms) to mark the stop and obtain
// the elapsed milliseconds (blocks until the stop event completes).
class GPUTimer {
    cudaEvent_t start, stop;

public:
    GPUTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GPUTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    inline void Record() { cudaEventRecord(start); }

    // Records the stop event, waits for it, and writes the elapsed time in
    // milliseconds (cudaEventElapsedTime convention).
    inline void Elapsed(float& time_elapsed)
    {
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time_elapsed, start, stop);
    }
};

// Host-side interval timer; Elapsed() returns milliseconds since the last
// Reset()/Elapsed() call and restarts the interval.
class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}

    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }

    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        // Microsecond count scaled to milliseconds.
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
                       1e3);
    }
};

#endif
deepspeed/ops/csrc/includes/Timer_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__

#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"

// GPU interval timer based on HIP events (hipified port of Timer.h).
// Record() marks the start; Elapsed(ms) marks the stop, synchronizes, and
// returns elapsed milliseconds.
class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }

    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }

    inline void Record() { hipEventRecord(start); }

    // Blocks until the stop event completes; result is in milliseconds.
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

// Host-side interval timer; Elapsed() returns milliseconds since the last
// Reset()/Elapsed() call and restarts the interval.
class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}

    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }

    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        // Microsecond count scaled to milliseconds.
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
                       1e3);
    }
};

#endif
deepspeed/ops/csrc/includes/compat.h
deleted
100644 → 0
View file @
1b2721ad
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/

// Older PyTorch headers expose AT_CHECK instead of TORCH_CHECK; alias so the
// rest of the code can use TORCH_CHECK unconditionally.
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

// The tensor accessor was renamed data() -> data_ptr(); the build defines
// VERSION_GE_1_3 to pick the right name (presumably torch >= 1.3 — confirm
// against the build scripts).
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
deepspeed/ops/csrc/includes/context.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "gemm_test.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
// Choose a 1-D grid size for N elements with DS_CUDA_NUM_THREADS per block,
// clamped to at most DS_MAXIMUM_NUM_BLOCKS and at least 1.
// (std::max)/(std::min) are parenthesized to dodge Windows min/max macros.
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}
// Process-wide singleton owning shared CUDA resources: a curand generator,
// a cublas handle, an externally supplied workspace pointer, and the RNG
// seed/offset pair used for dropout-style sampling.
class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        cublasDestroy(_cublasHandle);
        // Frees the workspace even though it was set externally via
        // SetWorkSpace — ownership transfers to the context.
        cudaFree(_workspace);
    }

    // Meyers singleton accessor.
    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    curandGenerator_t& GetRandGenerator() { return _gen; }

    cudaStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
        return stream;
    }

    cudaStream_t GetNewStream() { return at::cuda::getStreamFromPool(); }

    cublasHandle_t GetCublasHandle() { return _cublasHandle; }

    // Reserve offset_inc RNG samples; returns (seed, offset before increment).
    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    // Benchmark cublas GEMM algorithms once for the transformer shapes and
    // cache the winners in _gemm_algos; when test_gemm is false, algo id 99
    // (library default) is cached for all five GEMMs instead.
    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            cublasHandle_t handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     CUBLAS_OP_T,
                                     CUBLAS_OP_N,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            CUBLAS_OP_T,
                                            CUBLAS_OP_N,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            CUBLAS_OP_N,
                                            CUBLAS_OP_N,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    curandGenerator_t _gen;          // pseudo-random generator (seeded 123)
    cublasHandle_t _cublasHandle;    // shared cublas handle
    void* _workspace;                // externally provided scratch buffer
    uint64_t _seed;                  // RNG seed handed out by IncrementOffset
    uint64_t _curr_offset;           // running RNG sample offset
    std::vector<std::array<int, 3>> _gemm_algos;  // cached GEMM algo ids
};
deepspeed/ops/csrc/includes/context_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
// Choose a 1-D grid size for N elements with DS_CUDA_NUM_THREADS per block,
// clamped to at most DS_MAXIMUM_NUM_BLOCKS and at least 1 (hipified port).
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}
// Process-wide singleton owning shared HIP resources (hipified port of the
// CUDA Context): a hiprand generator, a rocblas handle, an externally
// supplied workspace pointer, and the RNG seed/offset pair.
class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        // Ownership of the workspace transfers to the context.
        hipFree(_workspace);
    }

    // Meyers singleton accessor.
    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    // Reserve offset_inc RNG samples; returns (seed, offset before increment).
    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    // Benchmark GEMM algorithms once for the transformer shapes and cache the
    // winners; when test_gemm is false, algo id 99 is cached for all five.
    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;        // pseudo-random generator (seeded 123)
    rocblas_handle _cublasHandle;   // shared rocblas handle
    void* _workspace;               // externally provided scratch buffer
    uint64_t _seed;                 // RNG seed handed out by IncrementOffset
    uint64_t _curr_offset;          // running RNG sample offset
    std::vector<std::array<int, 3>> _gemm_algos;  // cached GEMM algo ids
};
deepspeed/ops/csrc/includes/cpu_adagrad.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
// CPU-side Adagrad optimizer with optional AVX acceleration and
// double-buffered, stream-overlapped copies of updated params to the GPU.
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        // Pinned host buffers enable async H2D copies of updated parameters.
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    // Vectorized step over the largest prefix divisible by SIMD_WIDTH * span;
    // writes that prefix length to *rounded_size (tail handled by caller).
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    // Scalar step entry points Step_1 / Step_4 / Step_8 (declared via macro).
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
    }
    // Track the externally supplied step count.
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;         // learning rate
    float _eps;           // denominator epsilon
    float _weight_decay;

    // NOTE(review): _betta1_t/_betta2_t are never referenced in this header —
    // presumably leftovers from the Adam variant.
    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float* _doubled_buffer[2];  // pinned double buffers for GPU copies
    bool _buf_index;            // which buffer/stream is active

    cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template
<
int
span
>
void
Adagrad_Optimizer
::
Step_AVX
(
size_t
*
rounded_size
,
float
*
_params
,
float
*
grads
,
float
*
_exp_avg_sq
,
size_t
_param_size
,
__half
*
dev_params
,
bool
half_precision
)
{
size_t
new_rounded_size
=
0
;
AVX_Data
eps_4
;
eps_4
.
data
=
SIMD_SET
(
_eps
);
float
step_size
=
-
1
*
_alpha
;
AVX_Data
step_size_4
;
step_size_4
.
data
=
SIMD_SET
(
step_size
);
AVX_Data
weight_decay4
;
if
(
_weight_decay
>
0
)
weight_decay4
.
data
=
SIMD_SET
(
_weight_decay
);
new_rounded_size
=
ROUND_DOWN
(
_param_size
,
SIMD_WIDTH
*
span
);
for
(
size_t
t
=
0
;
t
<
new_rounded_size
;
t
+=
TILE
)
{
size_t
copy_size
=
TILE
;
if
((
t
+
TILE
)
>
new_rounded_size
)
copy_size
=
new_rounded_size
-
t
;
size_t
offset
=
copy_size
+
t
;
if
((
t
/
TILE
)
>=
2
)
{
cudaStreamSynchronize
(
_streams
[
_buf_index
]);
}
#pragma omp parallel for
for
(
size_t
i
=
t
;
i
<
offset
;
i
+=
SIMD_WIDTH
*
span
)
{
AVX_Data
grad_4
[
span
];
simd_load
<
span
>
(
grad_4
,
grads
+
i
,
half_precision
);
AVX_Data
momentum_4
[
span
];
simd_load
<
span
>
(
momentum_4
,
grads
+
i
,
false
);
AVX_Data
variance_4
[
span
];
simd_load
<
span
>
(
variance_4
,
_exp_avg_sq
+
i
,
false
);
AVX_Data
param_4
[
span
];
simd_load
<
span
>
(
param_4
,
_params
+
i
,
half_precision
);
if
(
_weight_decay
>
0
)
{
simd_fma
<
span
>
(
grad_4
,
param_4
,
weight_decay4
,
grad_4
);
}
simd_fma
<
span
>
(
variance_4
,
grad_4
,
grad_4
,
variance_4
);
simd_sqrt
<
span
>
(
grad_4
,
variance_4
);
simd_add
<
span
>
(
grad_4
,
grad_4
,
eps_4
);
simd_div
<
span
>
(
grad_4
,
momentum_4
,
grad_4
);
simd_fma
<
span
>
(
param_4
,
grad_4
,
step_size_4
,
param_4
);
simd_store
<
span
>
(
_params
+
i
,
param_4
,
half_precision
);
if
(
dev_params
)
{
simd_store
<
span
>
(
_doubled_buffer
[
_buf_index
]
+
(
i
-
t
),
param_4
,
half_precision
);
}
simd_store
<
span
>
(
_exp_avg_sq
+
i
,
variance_4
,
false
);
}
if
(
dev_params
)
{
if
(
half_precision
)
launch_param_update_half
(
_doubled_buffer
[
_buf_index
],
dev_params
+
t
,
copy_size
,
_streams
[
_buf_index
]);
else
launch_param_update
(
_doubled_buffer
[
_buf_index
],
dev_params
+
t
,
copy_size
,
_streams
[
_buf_index
]);
_buf_index
=
!
_buf_index
;
}
}
*
rounded_size
=
new_rounded_size
;
}
#endif
deepspeed/ops/csrc/includes/cpu_adagrad_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
// CPU-side Adagrad optimizer (hipified port): optional AVX acceleration and
// double-buffered, stream-overlapped copies of updated params to the GPU.
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        // Pinned host buffers enable async H2D copies of updated parameters.
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    // Vectorized step over the largest prefix divisible by SIMD_WIDTH * span;
    // writes that prefix length to *rounded_size (tail handled by caller).
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    // Scalar step entry points Step_1 / Step_4 / Step_8 (declared via macro).
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    // Track the externally supplied step count.
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;         // learning rate
    float _eps;           // denominator epsilon
    float _weight_decay;

    // NOTE(review): _betta1_t/_betta2_t are never referenced in this header —
    // presumably leftovers from the Adam variant.
    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float* _doubled_buffer[2];  // pinned double buffers for GPU copies
    bool _buf_index;            // which buffer/stream is active

    hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
// Vectorized Adagrad step over the first ROUND_DOWN(_param_size, SIMD_WIDTH * span)
// elements; that count is written to *rounded_size so the caller can finish the
// tail with the scalar Step_* paths. Work proceeds in TILE-sized chunks so that,
// when `dev_params` is given, the updated parameters of a finished chunk are
// copied to the device (via the pinned _doubled_buffer / _streams pair) while
// the next chunk is computed on the host.
//
// Per element (g = grad [+ weight_decay * param], v = _exp_avg_sq):
//     v += g * g
//     param += step_size * raw_grad / (sqrt(v) + eps)   with step_size = -alpha
// NOTE(review): the numerator uses the gradient as loaded from `grads`, i.e.
// before the weight-decay term is folded in -- confirm this is intended.
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    // Broadcast scalar hyper-parameters into SIMD registers once, up front.
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);
    // Negative so the final fma *subtracts* the scaled update from the params.
    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);
    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        // After two tiles both staging buffers are in flight; wait for the one
        // we are about to overwrite.
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);
            AVX_Data momentum_4[span];
            // BUGFIX: this second read of `grads` previously passed `false`,
            // which reinterpreted __half data as float whenever half_precision
            // was set. Use the same precision flag as the load above.
            simd_load<span>(momentum_4, grads + i, half_precision);
            // Optimizer state (_exp_avg_sq) is always kept in fp32.
            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);
            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);
            // Fold (L2) weight decay into the gradient used for the accumulator.
            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }
            // v += g * g
            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            // update = raw_grad / (sqrt(v) + eps)     (grad_4 is reused)
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);
            simd_store<span>(_params + i, param_4, half_precision);
            // Stage updated parameters in pinned memory for the async device copy.
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            // Flip to the other staging buffer / stream for the next tile.
            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
deepspeed/ops/csrc/includes/cpu_adam.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "cuda.h"
#include "custom_cuda_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
// Host-side Adam/AdamW optimizer state (CUDA build). Tracks the running beta
// power terms for bias correction and owns two pinned staging buffers plus two
// CUDA streams so parameter tiles can be pushed to the device asynchronously.
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        // Pinned host staging buffers for async copies of updated parameters.
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));
        // One stream per buffer enables double-buffered transfers.
        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }

    ~Adam_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }

#if defined(__AVX512__) or defined(__AVX256__)
    // Vectorized step; writes the count of processed elements to *rounded_size.
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif

    // Scalar entry points Step_1 / Step_4 / Step_8 (declared via the STEP macro).
    STEP(1)
    STEP(4)
    STEP(8)

    // Block until any in-flight parameter copies on both streams are done.
    inline void SynchronizeStreams()
    {
        cudaStreamSynchronize(_streams[0]);
        cudaStreamSynchronize(_streams[1]);
    }

    // Advance the step counter and keep beta1^step / beta2^step in sync with it.
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            // Betas changed: adopt them and rebase the power terms from scratch.
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else if (++_step == step) {
            // Fast path: consecutive step, just extend the running products.
            _betta1_t *= _betta1;
            _betta2_t *= _betta2;
        } else {
            // Out-of-sequence step: recompute the powers and resync the counter.
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
            _step = step;
        }
    }

    // Refresh hyper-parameters and (optionally) the bias-correction factors.
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
    float _alpha;  // learning rate
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;
    float _betta1_t;  // running beta1^step
    float _betta2_t;  // running beta2^step
    size_t _step;
    float _bias_correction1;
    float _bias_correction2;
    float* _doubled_buffer[2];  // pinned host staging buffers
    bool _buf_index;            // which staging buffer/stream is active
    bool _adamw_mode;           // true: decoupled weight decay (AdamW)
    cudaStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template
<
int
span
>
void
Adam_Optimizer
::
Step_AVX
(
size_t
*
rounded_size
,
float
*
_params
,
float
*
grads
,
float
*
_exp_avg
,
float
*
_exp_avg_sq
,
size_t
_param_size
,
__half
*
dev_params
,
bool
half_precision
)
{
size_t
new_rounded_size
=
0
;
AVX_Data
betta1_4
;
betta1_4
.
data
=
SIMD_SET
(
_betta1
);
AVX_Data
betta2_4
;
betta2_4
.
data
=
SIMD_SET
(
_betta2
);
float
betta1_minus1
=
1
-
_betta1
;
float
betta2_minus1
=
1
-
_betta2
;
AVX_Data
betta1_minus1_4
;
betta1_minus1_4
.
data
=
SIMD_SET
(
betta1_minus1
);
AVX_Data
betta2_minus1_4
;
betta2_minus1_4
.
data
=
SIMD_SET
(
betta2_minus1
);
AVX_Data
bias2_sqrt
;
bias2_sqrt
.
data
=
SIMD_SET
(
_bias_correction2
);
AVX_Data
eps_4
;
eps_4
.
data
=
SIMD_SET
(
_eps
);
float
step_size
=
-
1
*
_alpha
/
_bias_correction1
;
AVX_Data
step_size_4
;
step_size_4
.
data
=
SIMD_SET
(
step_size
);
float
w_decay
=
-
1
*
_alpha
*
_weight_decay
;
AVX_Data
weight_decay4
;
if
(
_weight_decay
>
0
)
weight_decay4
.
data
=
(
_adamw_mode
?
SIMD_SET
(
w_decay
)
:
SIMD_SET
(
_weight_decay
));
new_rounded_size
=
ROUND_DOWN
(
_param_size
,
SIMD_WIDTH
*
span
);
for
(
size_t
t
=
0
;
t
<
new_rounded_size
;
t
+=
TILE
)
{
size_t
copy_size
=
TILE
;
if
((
t
+
TILE
)
>
new_rounded_size
)
copy_size
=
new_rounded_size
-
t
;
size_t
offset
=
copy_size
+
t
;
if
((
t
/
TILE
)
>=
2
)
{
cudaStreamSynchronize
(
_streams
[
_buf_index
]);
}
#pragma omp parallel for
for
(
size_t
i
=
t
;
i
<
offset
;
i
+=
SIMD_WIDTH
*
span
)
{
AVX_Data
grad_4
[
span
];
simd_load
<
span
>
(
grad_4
,
grads
+
i
,
half_precision
);
AVX_Data
momentum_4
[
span
];
simd_load
<
span
>
(
momentum_4
,
_exp_avg
+
i
,
false
);
AVX_Data
variance_4
[
span
];
simd_load
<
span
>
(
variance_4
,
_exp_avg_sq
+
i
,
false
);
AVX_Data
param_4
[
span
];
simd_load
<
span
>
(
param_4
,
_params
+
i
,
half_precision
);
if
(
_weight_decay
>
0
&&
!
_adamw_mode
)
{
simd_fma
<
span
>
(
grad_4
,
param_4
,
weight_decay4
,
grad_4
);
}
simd_mul
<
span
>
(
momentum_4
,
momentum_4
,
betta1_4
);
simd_fma
<
span
>
(
momentum_4
,
grad_4
,
betta1_minus1_4
,
momentum_4
);
simd_mul
<
span
>
(
variance_4
,
variance_4
,
betta2_4
);
simd_mul
<
span
>
(
grad_4
,
grad_4
,
grad_4
);
simd_fma
<
span
>
(
variance_4
,
grad_4
,
betta2_minus1_4
,
variance_4
);
simd_sqrt
<
span
>
(
grad_4
,
variance_4
);
simd_fma
<
span
>
(
grad_4
,
grad_4
,
bias2_sqrt
,
eps_4
);
simd_div
<
span
>
(
grad_4
,
momentum_4
,
grad_4
);
if
(
_weight_decay
>
0
&&
_adamw_mode
)
{
simd_fma
<
span
>
(
param_4
,
param_4
,
weight_decay4
,
param_4
);
}
simd_fma
<
span
>
(
param_4
,
grad_4
,
step_size_4
,
param_4
);
simd_store
<
span
>
(
_params
+
i
,
param_4
,
half_precision
);
if
(
dev_params
)
{
simd_store
<
span
>
(
_doubled_buffer
[
_buf_index
]
+
(
i
-
t
),
param_4
,
half_precision
);
}
simd_store
<
span
>
(
_exp_avg
+
i
,
momentum_4
,
false
);
simd_store
<
span
>
(
_exp_avg_sq
+
i
,
variance_4
,
false
);
}
if
(
dev_params
)
{
if
(
half_precision
)
launch_param_update_half
(
_doubled_buffer
[
_buf_index
],
dev_params
+
t
,
copy_size
,
_streams
[
_buf_index
]);
else
launch_param_update
(
_doubled_buffer
[
_buf_index
],
dev_params
+
t
,
copy_size
,
_streams
[
_buf_index
]);
_buf_index
=
!
_buf_index
;
}
}
*
rounded_size
=
new_rounded_size
;
}
#endif
deepspeed/ops/csrc/includes/cpu_adam_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
// Host-side Adam/AdamW optimizer state (HIP build). Tracks the running beta
// power terms for bias correction and owns two pinned staging buffers plus two
// HIP streams so parameter tiles can be pushed to the device asynchronously.
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        // Pinned host staging buffers for async copies of updated parameters.
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));
        // One stream per buffer enables double-buffered transfers.
        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }

    ~Adam_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }

#if defined(__AVX512__) or defined(__AVX256__)
    // Vectorized step; writes the count of processed elements to *rounded_size.
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif

    // Scalar entry points Step_1 / Step_4 / Step_8 (declared via the STEP macro).
    STEP(1)
    STEP(4)
    STEP(8)

    // Block until any in-flight parameter copies on both streams are done.
    inline void SynchronizeStreams()
    {
        hipStreamSynchronize(_streams[0]);
        hipStreamSynchronize(_streams[1]);
    }

    // Advance the step counter and keep beta1^step / beta2^step in sync with it.
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            // Betas changed: adopt them and rebase the power terms from scratch.
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else if (++_step == step) {
            // Fast path: consecutive step, just extend the running products.
            _betta1_t *= _betta1;
            _betta2_t *= _betta2;
        } else {
            // Out-of-sequence step: recompute the powers and resync the counter.
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
            _step = step;
        }
    }

    // Refresh hyper-parameters and (optionally) the bias-correction factors.
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
    float _alpha;  // learning rate
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;
    float _betta1_t;  // running beta1^step
    float _betta2_t;  // running beta2^step
    size_t _step;
    float _bias_correction1;
    float _bias_correction2;
    float* _doubled_buffer[2];  // pinned host staging buffers
    bool _buf_index;            // which staging buffer/stream is active
    bool _adamw_mode;           // true: decoupled weight decay (AdamW)
    hipStream_t _streams[2];
};
#if defined(__AVX512__) or defined(__AVX256__)
template
<
int
span
>
void
Adam_Optimizer
::
Step_AVX
(
size_t
*
rounded_size
,
float
*
_params
,
float
*
grads
,
float
*
_exp_avg
,
float
*
_exp_avg_sq
,
size_t
_param_size
,
__half
*
dev_params
,
bool
half_precision
)
{
size_t
new_rounded_size
=
0
;
AVX_Data
betta1_4
;
betta1_4
.
data
=
SIMD_SET
(
_betta1
);
AVX_Data
betta2_4
;
betta2_4
.
data
=
SIMD_SET
(
_betta2
);
float
betta1_minus1
=
1
-
_betta1
;
float
betta2_minus1
=
1
-
_betta2
;
AVX_Data
betta1_minus1_4
;
betta1_minus1_4
.
data
=
SIMD_SET
(
betta1_minus1
);
AVX_Data
betta2_minus1_4
;
betta2_minus1_4
.
data
=
SIMD_SET
(
betta2_minus1
);
AVX_Data
bias2_sqrt
;
bias2_sqrt
.
data
=
SIMD_SET
(
_bias_correction2
);
AVX_Data
eps_4
;
eps_4
.
data
=
SIMD_SET
(
_eps
);
float
step_size
=
-
1
*
_alpha
/
_bias_correction1
;
AVX_Data
step_size_4
;
step_size_4
.
data
=
SIMD_SET
(
step_size
);
float
w_decay
=
-
1
*
_alpha
*
_weight_decay
;
AVX_Data
weight_decay4
;
if
(
_weight_decay
>
0
)
weight_decay4
.
data
=
(
_adamw_mode
?
SIMD_SET
(
w_decay
)
:
SIMD_SET
(
_weight_decay
));
new_rounded_size
=
ROUND_DOWN
(
_param_size
,
SIMD_WIDTH
*
span
);
for
(
size_t
t
=
0
;
t
<
new_rounded_size
;
t
+=
TILE
)
{
size_t
copy_size
=
TILE
;
if
((
t
+
TILE
)
>
new_rounded_size
)
copy_size
=
new_rounded_size
-
t
;
size_t
offset
=
copy_size
+
t
;
if
((
t
/
TILE
)
>=
2
)
{
hipStreamSynchronize
(
_streams
[
_buf_index
]);
}
#pragma omp parallel for
for
(
size_t
i
=
t
;
i
<
offset
;
i
+=
SIMD_WIDTH
*
span
)
{
AVX_Data
grad_4
[
span
];
simd_load
<
span
>
(
grad_4
,
grads
+
i
,
half_precision
);
AVX_Data
momentum_4
[
span
];
simd_load
<
span
>
(
momentum_4
,
_exp_avg
+
i
,
false
);
AVX_Data
variance_4
[
span
];
simd_load
<
span
>
(
variance_4
,
_exp_avg_sq
+
i
,
false
);
AVX_Data
param_4
[
span
];
simd_load
<
span
>
(
param_4
,
_params
+
i
,
half_precision
);
if
(
_weight_decay
>
0
&&
!
_adamw_mode
)
{
simd_fma
<
span
>
(
grad_4
,
param_4
,
weight_decay4
,
grad_4
);
}
simd_mul
<
span
>
(
momentum_4
,
momentum_4
,
betta1_4
);
simd_fma
<
span
>
(
momentum_4
,
grad_4
,
betta1_minus1_4
,
momentum_4
);
simd_mul
<
span
>
(
variance_4
,
variance_4
,
betta2_4
);
simd_mul
<
span
>
(
grad_4
,
grad_4
,
grad_4
);
simd_fma
<
span
>
(
variance_4
,
grad_4
,
betta2_minus1_4
,
variance_4
);
simd_sqrt
<
span
>
(
grad_4
,
variance_4
);
simd_fma
<
span
>
(
grad_4
,
grad_4
,
bias2_sqrt
,
eps_4
);
simd_div
<
span
>
(
grad_4
,
momentum_4
,
grad_4
);
if
(
_weight_decay
>
0
&&
_adamw_mode
)
{
simd_fma
<
span
>
(
param_4
,
param_4
,
weight_decay4
,
param_4
);
}
simd_fma
<
span
>
(
param_4
,
grad_4
,
step_size_4
,
param_4
);
simd_store
<
span
>
(
_params
+
i
,
param_4
,
half_precision
);
if
(
dev_params
)
{
simd_store
<
span
>
(
_doubled_buffer
[
_buf_index
]
+
(
i
-
t
),
param_4
,
half_precision
);
}
simd_store
<
span
>
(
_exp_avg
+
i
,
momentum_4
,
false
);
simd_store
<
span
>
(
_exp_avg_sq
+
i
,
variance_4
,
false
);
}
if
(
dev_params
)
{
if
(
half_precision
)
launch_param_update_half
(
_doubled_buffer
[
_buf_index
],
dev_params
+
t
,
copy_size
,
_streams
[
_buf_index
]);
else
launch_param_update
(
_doubled_buffer
[
_buf_index
],
dev_params
+
t
,
copy_size
,
_streams
[
_buf_index
]);
_buf_index
=
!
_buf_index
;
}
}
*
rounded_size
=
new_rounded_size
;
}
#endif
deepspeed/ops/csrc/includes/cublas_wrappers.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
// FP32 GEMM wrapper declaration. Parameter names mirror the cuBLAS GEMM
// interface (op(A)/op(B) selected by transa/transb, host scalars alpha/beta);
// the implementation is in the corresponding .cu file (not visible here).
// Returns an int status code -- presumably 0 on success; confirm in the
// implementation. The trailing `algo` default is platform-dependent.
int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const float* A,
                   const float* B,
                   float* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// FP16 GEMM wrapper overload (__half data, float host scalars). Defaults to a
// tensor-op algorithm on the CUDA path.
int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const __half* A,
                   const __half* B,
                   __half* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// FP32 strided-batched GEMM wrapper: `batch` GEMMs whose A/B/C matrices are
// laid out stride_A/stride_B/stride_C elements apart.
int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const float* A,
                                const float* B,
                                float* C,
                                cublasOperation_t op_A,
                                cublasOperation_t op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// FP16 strided-batched GEMM wrapper overload.
int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const __half* A,
                                const __half* B,
                                __half* C,
                                cublasOperation_t op_A,
                                cublasOperation_t op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
deepspeed/ops/csrc/includes/cublas_wrappers_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
// FP32 GEMM wrapper declaration (hipified: rocBLAS handle/operation types).
// Implementation lives in the corresponding hipified source file (not visible
// here); returns an int status code -- presumably 0 on success, confirm there.
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const float* A,
                   const float* B,
                   float* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// FP16 GEMM wrapper overload (__half data, float host scalars).
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const __half* A,
                   const __half* B,
                   __half* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
// FP32 strided-batched GEMM wrapper: `batch` GEMMs whose A/B/C matrices are
// laid out stride_A/stride_B/stride_C elements apart.
int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const float* A,
                                const float* B,
                                float* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif
// FP16 strided-batched GEMM wrapper overload.
int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const __half* A,
                                const __half* B,
                                __half* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
deepspeed/ops/csrc/includes/custom_cuda_layers.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
// Abort-on-error check for CUDA runtime calls; prints file/line on failure.
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }
// Launch-configuration constants shared by the kernels declared below.
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8  // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
// ---------------------------------------------------------------------------
// Host-side launch wrappers for the custom CUDA kernels. Only declarations are
// visible in this header; kernel semantics below are inferred from the names
// and should be confirmed against the .cu implementations.
// ---------------------------------------------------------------------------
// In-place quantization of `vals` (total_count elements, group_num groups,
// num_bits precision); the _sr variants presumably use stochastic rounding and
// the _asym variants asymmetric ranges -- confirm in the implementation.
template <typename T>
void launch_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, cudaStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, cudaStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
                      const T* bias,
                      T* output,
                      int intermediate_size,
                      int batch_size,
                      cudaStream_t stream);
// GELU without the fused bias add.
template <typename T>
void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, cudaStream_t stream);
// Backward of the fused bias+GELU; writes the gradient into d_output in place.
template <typename T>
void launch_d_gelu(T* d_output,
                   const T* input,
                   const T* bias,
                   int intermediate_size,
                   int batch_size,
                   cudaStream_t stream);
// Custom fused bias add with layer normalization
// Overload that also returns per-row means (for the non-invertible backward).
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
                                     const T* residual,
                                     const T* gamma,
                                     const T* beta,
                                     float epsilon,
                                     int batch_size,
                                     int hidden_dim,
                                     cudaStream_t stream,
                                     bool preLayerNorm,
                                     bool training,
                                     T* vars,
                                     T* means);
// Overload that returns only the variances.
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
                                     const T* residual,
                                     const T* gamma,
                                     const T* beta,
                                     float epsilon,
                                     int batch_size,
                                     int hidden_dim,
                                     cudaStream_t stream,
                                     bool preLayerNorm,
                                     bool training,
                                     T* vars);
// LayerNorm backward fused with the add of a second gradient stream; uses the
// saved input X_data plus means/vars. Takes two streams (gamma/beta grads and
// input grads are presumably computed concurrently -- confirm in the .cu file).
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
                                         const T* out_grad2,
                                         const T* X_data,
                                         const T* vars,
                                         const T* means,
                                         const T* gamma,
                                         T* gamma_grad,
                                         T* betta_grad,
                                         T* inp_grad,
                                         int batch_size,
                                         int hidden_dim,
                                         cudaStream_t stream[2]);
// Variant operating on the normalized output (vals_hat) instead of the raw
// input; `invertible` selects reconstruction using `betta`.
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
                                         const T* out_grad2,
                                         const T* vals_hat,
                                         const T* vars,
                                         const T* gamma,
                                         T* gamma_grad,
                                         T* betta_grad,
                                         T* inp_grad,
                                         int batch_size,
                                         int hidden_dim,
                                         cudaStream_t stream[2],
                                         bool invertible = false,
                                         const T* betta = nullptr);
// LayerNorm backward from the saved input (X_data, means, vars).
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* X_data,
                               const T* vars,
                               const T* means,
                               const T* gamma,
                               T* gamma_grad,
                               T* betta_grad,
                               T* inp_grad,
                               int batch_size,
                               int hidden_dim,
                               cudaStream_t stream[2]);
// LayerNorm backward from the normalized output (vals_hat).
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* vals_hat,
                               const T* vars,
                               const T* gamma,
                               T* gamma_grad,
                               T* betta_grad,
                               T* inp_grad,
                               int batch_size,
                               int hidden_dim,
                               cudaStream_t stream[2],
                               bool invertible = false,
                               const T* betta = nullptr);
// Non-reversible LayerNorm backward taking transposed copies of the gradient
// and activations as additional inputs.
template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
                                           const T* vals,
                                           const T* out_grad_trans,
                                           const T* vals_trans,
                                           const T* means,
                                           const T* vars,
                                           const T* gamma,
                                           T* gamma_grad,
                                           T* betta_grad,
                                           T* inp_grad,
                                           int batch_size,
                                           int hidden_dim,
                                           cudaStream_t stream[2]);
// Matrix transpose of a rows x cols input into out_mat.
template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, cudaStream_t stream);
template <typename T>
void launch_attn_softmax_backward(T* out_grad,
                                  const T* soft_inp,
                                  int batch_size,
                                  int heads,
                                  int seq_length,
                                  cudaStream_t stream);
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
                                     const T* soft_inp,
                                     int batch_size,
                                     int heads,
                                     int seq_length,
                                     cudaStream_t stream);
// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
                         const T* attn_mask,
                         int batch_size,
                         int heads,
                         int sequence_length,
                         cudaStream_t stream);
// Permutes a [0,1,2,3] tensor into [0,2,1,3] layout (see transform4d below).
template <typename T>
void launch_transform_0213(T* output,
                           const T* vals,
                           int batch_size,
                           int seq_length,
                           int hidden_dim,
                           int heads,
                           cudaStream_t stream);
// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
                                    const T* vals,
                                    const T* bias,
                                    int batch_size,
                                    int seq_length,
                                    int hidden_dim,
                                    int heads,
                                    cudaStream_t stream,
                                    int trans_count);
// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
                             const T* in,
                             int batch_size,
                             int heads,
                             int seq_length,
                             int hidden_dim,
                             cudaStream_t stream,
                             int trans_count);
// Dropout fused with a bias add, applied in place to `vals`; `mask` records
// the kept/dropped elements for the backward pass.
template <typename T>
void launch_dropout(T* vals,
                    const T* bias,
                    uint8_t* mask,
                    int batch,
                    int dim,
                    float ratio,
                    cudaStream_t stream);
// Out-of-place dropout; `bwd` reuses the recorded mask for the backward pass.
template <typename T>
void launch_dropout(T* vals_out,
                    const T* vals,
                    uint8_t* mask,
                    int total_count,
                    int dim,
                    float ratio,
                    cudaStream_t stream,
                    bool bwd = false);
// Dropout fused with residual and bias adds.
template <typename T>
void launch_dropout(T* out,
                    const T* vals,
                    const T* residual,
                    const T* bias,
                    uint8_t* mask,
                    int batch,
                    int dim,
                    float ratio,
                    cudaStream_t stream);
// In-place dropout backward using the recorded mask.
template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, cudaStream_t stream);
// Out-of-place dropout backward.
template <typename T>
void launch_dropout_grad(T* vals_out,
                         const T* vals,
                         uint8_t* mask,
                         int total_count,
                         float ratio,
                         cudaStream_t stream);
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
                                       T* out,
                                       int rows,
                                       int cols,
                                       cudaStream_t stream);
// Copy `size` fp32 values to a device __half buffer on the given stream (used
// by the CPU optimizers to push updated parameters to the GPU).
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
// Same as above; variant used when the staged data is half-precision packed in
// the float buffer -- confirm exact layout in the implementation.
void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream);
Prev
1
…
11
12
13
14
15
16
17
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment