OpenDAS / deepspeed / Commits

Commit 4acf0e01
Authored Apr 26, 2023 by aiss
    delete hip file

parent 7dd68788
Changes: 83
Showing 20 changed files with 0 additions and 2864 deletions (+0 -2864)

csrc/adagrad/cpu_adagrad_hip.cpp       +0  -228
csrc/adam/cpu_adam_hip.cpp             +0  -293
csrc/adam/custom_hip_kernel.hip        +0  -22
csrc/adam/multi_tensor_adam.hip        +0  -164
csrc/adam/multi_tensor_apply_hip.cuh   +0  -129
csrc/common/custom_hip_kernel.hip      +0  -41
csrc/includes/Timer_hip.h              +0  -48
csrc/includes/context_hip.h            +0  -172
csrc/includes/cpu_adagrad_hip.h        +0  -151
csrc/includes/cpu_adam_hip.h           +0  -226
csrc/includes/cublas_wrappers_hip.h    +0  -88
csrc/includes/custom_hip_layers.h      +0  -304
csrc/includes/dropout_hip.h            +0  -77
csrc/includes/ds_transformer_hip.h     +0  -185
csrc/includes/feed_forward_hip.h       +0  -106
csrc/includes/gelu_hip.h               +0  -37
csrc/includes/gemm_test_hip.h          +0  -328
csrc/includes/general_kernels_hip.h    +0  -52
csrc/includes/normalize_layer_hip.h    +0  -203
csrc/includes/quantizer_hip.h          +0  -10
csrc/adagrad/cpu_adagrad_hip.cpp  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adagrad_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;

// C++ interface

void Adagrad_Optimizer::Step_1(float* _params, float* grads, float* _exp_avg_sq,
                               size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size) {
        float step_size = -1 * _alpha;
        __half* grads_cast_h;
        __half* params_cast_h;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }
        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                float momentum = grads[k];
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0) { grad = param * _weight_decay + grad; }

                variance += grad * grad;

                grad = sqrt(variance);
                grad += _eps;
                grad = momentum / grad;
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
                if (half_precision)
                    params_cast_h[k] = (__half)param;
                else
                    _params[k] = param;
                // STORE UPDATE TERM TO GRAD'S MEMORY
                grads[k] = grad * step_size;
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
                _buf_index = !_buf_index;
            }
        }
    }
}

void Adagrad_Optimizer::Step_4(float* _params, float* grads, float* _exp_avg_sq,
                               size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<4>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_1((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int create_adagrad_optimizer(int optimizer_id,
                             float alpha = 1e-2,
                             float eps = 1e-8,
                             float weight_decay = 0,
                             bool should_log = false)
{
    auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);

    s_optimizers[optimizer_id] = opt;

    if (should_log) {
        std::string avx_type = "";
#if defined(__AVX512__)
        avx_type = "AVX512";
#else
#if defined(__AVX256__)
        avx_type = "AVX2";
#else
        avx_type = "scalar";
#endif
#endif
        printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
               optimizer_id,
               avx_type.c_str());
        printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
    }

    return 0;
}

void Adagrad_Optimizer::Step_8(float* _params, float* grads, float* _exp_avg_sq,
                               size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<8>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_4((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int ds_adagrad_step(int optimizer_id,
                    size_t step,
                    float lr,
                    float epsilon,
                    float weight_decay,
                    torch::Tensor& params,
                    torch::Tensor& grads,
                    torch::Tensor& exp_avg_sq)
{
    auto params_c = params.contiguous();
    auto grads_c = grads.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adagrad_Optimizer> opt =
        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step);
    opt->update_state(lr, epsilon, weight_decay);
    opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));

    opt->SynchronizeStreams();
    return 0;
}

int ds_adagrad_step_plus_copy(int optimizer_id,
                              size_t step,
                              float lr,
                              float epsilon,
                              float weight_decay,
                              torch::Tensor& params,
                              torch::Tensor& grads,
                              torch::Tensor& exp_avg_sq,
                              torch::Tensor& gpu_params)
{
    auto params_c = params.contiguous();
    auto gpu_params_c = gpu_params.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();
    auto grads_c = grads.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adagrad_Optimizer> opt =
        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step);
    opt->update_state(lr, epsilon, weight_decay);
    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                gpu_params_ptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int destroy_adagrad_optimizer(int optimizer_id)
{
    s_optimizers.erase(optimizer_id);

    return 0;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
    m.def("adagrad_update_copy",
          &ds_adagrad_step_plus_copy,
          "DeepSpeed CPU Adagrad update and param copy (C++)");
    m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
    m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
csrc/adam/cpu_adam_hip.cpp  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adam_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;

// C++ interface

void Adam_Optimizer::Step_1(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                            size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size) {
        float betta1_minus1 = 1 - _betta1;
        float betta2_minus1 = 1 - _betta2;

        float step_size = -1 * _alpha / _bias_correction1;
        float w_decay = -1 * _alpha * _weight_decay;
        __half* grads_cast_h;
        __half* params_cast_h;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }

        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                float momentum = _exp_avg[k];
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
                momentum = momentum * _betta1;
                momentum = grad * betta1_minus1 + momentum;

                variance = variance * _betta2;
                grad = grad * grad;
                variance = grad * betta2_minus1 + variance;

                grad = sqrt(variance);
                grad = grad * _bias_correction2 + _eps;
                grad = momentum / grad;
                if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;

                if (half_precision)
                    params_cast_h[k] = (__half)param;
                else
                    _params[k] = param;
                _exp_avg[k] = momentum;
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
                _buf_index = !_buf_index;
            }
        }
    }
}

void Adam_Optimizer::Step_4(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                            size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_1((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int create_adam_optimizer(int optimizer_id,
                          float alpha = 1e-3,
                          float betta1 = 0.9,
                          float betta2 = 0.999,
                          float eps = 1e-8,
                          float weight_decay = 0,
                          bool adamw_mode = true,
                          bool should_log = false)
{
    auto opt = std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);

    s_optimizers[optimizer_id] = opt;

    if (should_log) {
        std::string avx_type = "";
#if defined(__AVX512__)
        avx_type = "AVX512";
#else
#if defined(__AVX256__)
        avx_type = "AVX2";
#else
        avx_type = "scalar";
#endif
#endif
        printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
               optimizer_id,
               avx_type.c_str());
        printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
               alpha,
               betta1,
               betta2,
               weight_decay,
               (int)adamw_mode);
    }

    return 0;
}

void Adam_Optimizer::Step_8(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                            size_t _param_size, __half* dev_params, bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_4((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int ds_adam_step(int optimizer_id,
                 size_t step,
                 float lr,
                 float beta1,
                 float beta2,
                 float epsilon,
                 float weight_decay,
                 bool bias_correction,
                 torch::Tensor& params,
                 torch::Tensor& grads,
                 torch::Tensor& exp_avg,
                 torch::Tensor& exp_avg_sq)
{
    auto params_c = params.contiguous();
    auto grads_c = grads.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();

    // assert(params.options().dtype() == grads.options().dtype());

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);

    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                nullptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int ds_adam_step_plus_copy(int optimizer_id,
                           size_t step,
                           float lr,
                           float beta1,
                           float beta2,
                           float epsilon,
                           float weight_decay,
                           bool bias_correction,
                           torch::Tensor& params,
                           torch::Tensor& grads,
                           torch::Tensor& exp_avg,
                           torch::Tensor& exp_avg_sq,
                           torch::Tensor& gpu_params)
{
    auto params_c = params.contiguous();
    auto gpu_params_c = gpu_params.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();
    auto grads_c = grads.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);
    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                gpu_params_ptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int destroy_adam_optimizer(int optimizer_id)
{
    s_optimizers.erase(optimizer_id);

    return 0;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
    m.def("adam_update_copy",
          &ds_adam_step_plus_copy,
          "DeepSpeed CPU Adam update and param copy (C++)");
    m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
    m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}
csrc/adam/custom_hip_kernel.hip  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/adam/multi_tensor_adam.hip  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "multi_tensor_apply_hip.cuh"
#include "type_shim_hip.h"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum {
ADAM_MODE_0 = 0, // L2 regularization mode
ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW)
} adamMode_t;
using MATH_T = float;
template <typename T>
struct AdamFunctor {
__device__ __forceinline__ void operator()(int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
const float lr,
adamMode_t mode,
const float decay)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
// potentially use to pass in list of scalar
// int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx * chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx * chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx * chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx * chunk_size;
n -= chunk_idx * chunk_size;
// see note in multi_tensor_scale_kernel.cu
for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
if (mode == ADAM_MODE_0) { // L2
r_g[ii] = r_g[ii] + (decay * r_p[ii]);
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = next_m_unbiased / denom;
r_p[ii] = r_p[ii] - (lr * update);
} else { // weight decay
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (lr * update);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
p[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
void multi_tensor_adam_cuda(int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - ::pow(beta1, step);
bias_correction2 = 1 - ::pow(beta2, step);
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
0,
"adam",
multi_tensor_apply<4>(BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
bias_correction1,
bias_correction2,
epsilon,
lr,
(adamMode_t)mode,
weight_decay);)
AT_CUDA_CHECK(hipGetLastError());
}
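The AdamFunctor above branches on adamMode_t: ADAM_MODE_0 folds the weight decay into the gradient (classic L2 regularization), while ADAM_MODE_1 adds it to the update term (decoupled weight decay, i.e. AdamW). The scalar sketch below illustrates only that difference; it is not code from the deleted file, and the helper name adam_scalar_step and the sample values are assumptions.

// Minimal scalar sketch (illustration only, not part of the deleted file) of the
// two adamMode_t branches used by AdamFunctor above.
#include <cmath>
#include <cstdio>

struct AdamState { float m = 0.f, v = 0.f; };

// One element-wise step, mirroring the per-element body of AdamFunctor.
float adam_scalar_step(float p, float g, AdamState& s, float lr, float beta1, float beta2,
                       float eps, float decay, int step, bool adamw /* ADAM_MODE_1 */)
{
    if (!adamw) g += decay * p;                       // L2 mode: decay enters the gradient
    s.m = beta1 * s.m + (1 - beta1) * g;
    s.v = beta2 * s.v + (1 - beta2) * g * g;
    float m_hat = s.m / (1 - std::pow(beta1, step));  // bias corrections
    float v_hat = s.v / (1 - std::pow(beta2, step));
    float update = m_hat / (std::sqrt(v_hat) + eps);
    if (adamw) update += decay * p;                   // AdamW mode: decay enters the update
    return p - lr * update;
}

int main()
{
    AdamState s;
    float p = 1.0f;
    for (int step = 1; step <= 3; ++step)
        p = adam_scalar_step(p, 0.1f, s, 1e-3f, 0.9f, 0.999f, 1e-8f, 0.01f, step, true);
    std::printf("param after 3 AdamW steps: %f\n", p);
    return 0;
}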
csrc/adam/multi_tensor_apply_hip.cuh  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "compat.h"
#include <assert.h>
// #include <iostream>
// This header is the one-stop shop for all your multi-tensor apply needs.
// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

template <int n>
struct TensorListMetadata {
    void* addresses[n][depth_to_max_tensors[n - 1]];
    int sizes[depth_to_max_tensors[n - 1]];
    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
    int block_to_chunk[depth_to_max_blocks[n - 1]];  // I fear this needs to be a full int.
    int start_tensor_this_launch;
};

template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int chunk_size,
                                          volatile int* noop_flag,
                                          T tl,
                                          U callable,
                                          ArgTypes... args)
{
    // Hand the chunk information to the user-supplied functor to process however it likes.
    callable(chunk_size, noop_flag, tl, args...);
}

template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int block_size,
                        int chunk_size,
                        const at::Tensor& noop_flag,
                        const std::vector<std::vector<at::Tensor>>& tensor_lists,
                        T callable,
                        ArgTypes... args)
{
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++) {
            // TODO: Print which tensor fails.
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory =
                (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
                        "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }

    int ntensors = tensor_lists[0].size();

    TensorListMetadata<depth> tl;

    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();

    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for (int t = 0; t < ntensors; t++) {
        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;

        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

        for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
            // std::cout << chunks_this_tensor << std::endl;
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;

            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                                 chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk) {
                // using accscalar_t = acc_type<scalar_t, true>;
                hipLaunchKernelGGL((multi_tensor_apply_kernel),
                                   dim3(loc_block_info),
                                   dim3(block_size),
                                   0,
                                   stream,
                                   chunk_size,
                                   noop_flag.DATA_PTR<int>(),
                                   tl,
                                   callable,
                                   args...);

                AT_CUDA_CHECK(hipGetLastError());

                // Reset.  The control flow possibilities here make my brain hurt.
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1) {
                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                } else {
                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }
}
csrc/common/custom_hip_kernel.hip  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
__half2* output_cast = reinterpret_cast<__half2*>(output);
if (id < size) {
float input_f = input[id];
__half2* input_h = reinterpret_cast<__half2*>(&input_f);
output_cast[id] = *input_h;
}
}
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
size /= 2;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/includes/Timer_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__
#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"
class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    inline void Record() { hipEventRecord(start); }
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() / 1e3);
    }
};
#endif
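A brief usage sketch of the two timers declared above; it is not taken from the repository. The kernel busy_kernel, the buffer size, and the grid configuration are assumptions made for illustration, and the runtime calls (hipMalloc, hipLaunchKernelGGL, hipFree) are standard HIP entry points.

// Hedged usage sketch (illustration only): timing a kernel with GPUTimer and
// comparing against CPUTimer wall-clock time. Assumes a HIP device is present.
#include <hip/hip_runtime.h>
#include <cstdio>
#include "Timer_hip.h"

__global__ void busy_kernel(float* x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] = x[i] * 2.0f + 1.0f;
}

int main()
{
    const int n = 1 << 20;
    float* d_x = nullptr;
    hipMalloc(&d_x, n * sizeof(float));

    GPUTimer gpu_timer;
    CPUTimer cpu_timer;

    gpu_timer.Record();                       // records the start event
    hipLaunchKernelGGL(busy_kernel, dim3((n + 255) / 256), dim3(256), 0, 0, d_x, n);
    float gpu_ms = 0.f;
    gpu_timer.Elapsed(gpu_ms);                // records stop, synchronizes, returns ms

    float cpu_ms = cpu_timer.Elapsed();       // wall-clock ms since construction
    std::printf("kernel: %.3f ms (GPU events), %.3f ms (CPU clock)\n", gpu_ms, cpu_ms);

    hipFree(d_x);
    return 0;
}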
csrc/includes/context_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}

class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        hipFree(_workspace);
    }

    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;
    rocblas_handle _cublasHandle;
    void* _workspace;
    uint64_t _seed;
    uint64_t _curr_offset;
    std::vector<std::array<int, 3>> _gemm_algos;
};
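A hedged sketch of how code in this extension typically consumes the Context singleton above (shared rocBLAS handle, current HIP stream, RNG seed/offset pairs, and the block-count helper). The function name and the particular way the values are combined here are illustrative assumptions, not code from the deleted header.

// Illustration only, assuming context_hip.h is on the include path.
#include "context_hip.h"

void example_use_of_context(int elements)
{
    // One process-wide context: rocBLAS handle plus PyTorch's current HIP stream.
    rocblas_handle blas = Context::Instance().GetCublasHandle();
    hipStream_t stream = Context::Instance().GetCurrentStream();

    // Grid sizing helper defined in this header (capped at DS_MAXIMUM_NUM_BLOCKS).
    int blocks = DS_GET_BLOCKS(elements);

    // (seed, offset) pair that randomized kernels can use; the offset advances by
    // the number of random values this launch is expected to consume.
    std::pair<uint64_t, uint64_t> seed_and_offset = Context::Instance().IncrementOffset(elements);

    (void)blas; (void)stream; (void)blocks; (void)seed_and_offset;
}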
csrc/includes/cpu_adagrad_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float* _doubled_buffer[2];
    bool _buf_index;
    hipStream_t _streams[2];
};

#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, grads + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }

            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
csrc/includes/cpu_adam_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float _bias_correction1;
    float _bias_correction2;

    float* _doubled_buffer[2];
    bool _buf_index;
    bool _adamw_mode;

    hipStream_t _streams[2];
};

#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
                              float* _params,
                              float* grads,
                              float* _exp_avg,
                              float* _exp_avg_sq,
                              size_t _param_size,
                              __half* dev_params,
                              bool half_precision)
{
    size_t new_rounded_size = 0;

    AVX_Data betta1_4;
    betta1_4.data = SIMD_SET(_betta1);
    AVX_Data betta2_4;
    betta2_4.data = SIMD_SET(_betta2);

    float betta1_minus1 = 1 - _betta1;
    float betta2_minus1 = 1 - _betta2;
    AVX_Data betta1_minus1_4;
    betta1_minus1_4.data = SIMD_SET(betta1_minus1);
    AVX_Data betta2_minus1_4;
    betta2_minus1_4.data = SIMD_SET(betta2_minus1);

    AVX_Data bias2_sqrt;
    bias2_sqrt.data = SIMD_SET(_bias_correction2);

    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    float step_size = -1 * _alpha / _bias_correction1;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    float w_decay = -1 * _alpha * _weight_decay;
    AVX_Data weight_decay4;
    if (_weight_decay > 0)
        weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, _exp_avg + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0 && !_adamw_mode) {
                simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
            }

            simd_mul<span>(momentum_4, momentum_4, betta1_4);
            simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
            simd_mul<span>(variance_4, variance_4, betta2_4);
            simd_mul<span>(grad_4, grad_4, grad_4);
            simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);

            if (_weight_decay > 0 && _adamw_mode) {
                simd_fma<span>(param_4, param_4, weight_decay4, param_4);
            }

            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg + i, momentum_4, false);
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
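One detail worth spelling out: update_state() precomputes _bias_correction1 = 1 - beta1^t and _bias_correction2 = 1 / sqrt(1 - beta2^t), and Step_1/Step_AVX then compute m / (sqrt(v) * _bias_correction2 + eps) scaled by -alpha / _bias_correction1. Because sqrt(v / (1 - beta2^t)) equals sqrt(v) / sqrt(1 - beta2^t), this is algebraically the textbook bias-corrected Adam step. The self-contained check below is an illustration only, not part of the deleted header; the sample values are made up.

// Illustration: the rearranged update used above matches the textbook form.
#include <cmath>
#include <cstdio>

int main()
{
    const float alpha = 1e-3f, beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
    const int step = 7;
    float m = 0.02f, v = 0.0005f, param = 1.0f, grad_in = 0.1f;

    m = beta1 * m + (1 - beta1) * grad_in;
    v = beta2 * v + (1 - beta2) * grad_in * grad_in;

    // Textbook form: divide the moments by their corrections.
    float m_hat = m / (1 - std::pow(beta1, step));
    float v_hat = v / (1 - std::pow(beta2, step));
    float p_ref = param - alpha * m_hat / (std::sqrt(v_hat) + eps);

    // Rearranged form used by the deleted optimizer code.
    float bc1 = 1 - std::pow(beta1, step);            // _bias_correction1
    float bc2 = 1 / std::sqrt(1 - std::pow(beta2, step));  // _bias_correction2
    float denom = std::sqrt(v) * bc2 + eps;
    float p_ds = param + (-alpha / bc1) * (m / denom);

    std::printf("textbook: %.9f  rearranged: %.9f\n", p_ref, p_ds);
    return 0;
}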
csrc/includes/cublas_wrappers_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa, rocblas_operation transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const float* A, const float* B, float* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa, rocblas_operation transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const __half* A, const __half* B, __half* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const float* A, const float* B, float* C,
                                rocblas_operation op_A, rocblas_operation op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const __half* A, const __half* B, __half* C,
                                rocblas_operation op_A, rocblas_operation op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
csrc/includes/custom_hip_layers.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8 // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);

// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input, const T* bias, T* output,
                      int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_d_gelu(T* d_output, const T* input, const T* bias,
                   int intermediate_size, int batch_size, hipStream_t stream);

// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim, hipStream_t stream,
                                     bool preLayerNorm, bool training, T* vars, T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim, hipStream_t stream,
                                     bool preLayerNorm, bool training, T* vars);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2, const T* X_data,
                                         const T* vars, const T* means, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2, const T* vals_hat,
                                         const T* vars, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, hipStream_t stream[2],
                                         bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* X_data, const T* vars, const T* means,
                               const T* gamma, T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* vals_hat, const T* vars, const T* gamma,
                               T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, hipStream_t stream[2],
                               bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad, const T* vals,
                                           const T* out_grad_trans, const T* vals_trans,
                                           const T* means, const T* vars, const T* gamma,
                                           T* gamma_grad, T* betta_grad, T* inp_grad,
                                           int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward(T* out_grad, const T* soft_inp,
                                  int batch_size, int heads, int seq_length, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, const T* soft_inp,
                                     int batch_size, int heads, int seq_length, hipStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals, const T* attn_mask,
                         int batch_size, int heads, int sequence_length, hipStream_t stream);

template <typename T>
void launch_transform_0213(T* output, const T* vals, int batch_size, int seq_length,
                           int hidden_dim, int heads, hipStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs, const T* vals, const T* bias, int batch_size,
                                    int seq_length, int hidden_dim, int heads,
                                    hipStream_t stream, int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out, const T* in, int batch_size, int heads, int seq_length,
                             int hidden_dim, hipStream_t stream, int trans_count);

template <typename T>
void launch_dropout(T* vals, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout(T* vals_out, const T* vals, uint8_t* mask, int total_count, int dim,
                    float ratio, hipStream_t stream, bool bwd = false);

template <typename T>
void launch_dropout(T* out, const T* vals, const T* residual, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals_out, const T* vals, uint8_t* mask,
                         int total_count, float ratio, hipStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols, hipStream_t stream);

void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream);
csrc/includes/dropout_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;
        uint32_t dim;
        bool training;

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        float RATIO() const { return training ? ratio : 0.0; }
        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void ForwardWithBias(int bsz, T* out, const T* vals, const T* residual, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
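A hedged usage sketch for the Dropout<T> wrapper above: the mask buffer is owned by the caller and handed in through SetMask(), and RATIO() collapses to zero outside training mode. The function name, buffer names, and the 0.1 ratio below are assumptions made for illustration, not code from the repository.

// Illustration only, assuming dropout_hip.h and its launch_dropout declarations are available.
#include <hip/hip_runtime.h>
#include "dropout_hip.h"

void attention_output_dropout_example(float* activations, const float* bias,
                                      uint8_t* mask_buffer, int batch_tokens,
                                      int hidden_dim, hipStream_t stream, bool training)
{
    Dropout<float> dropout(Dropout<float>::Config(0.1f, hidden_dim));  // ratio, dim
    dropout.SetTrainingMode(training);   // RATIO() becomes 0.0 in eval mode
    dropout.SetMask(mask_buffer);        // mask memory is owned by the caller

    // Fused bias + dropout over batch_tokens rows of hidden_dim elements.
    dropout.ForwardWithBias(batch_tokens, activations, bias, stream);
}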
csrc/includes/ds_transformer_hip.h  (deleted, 100644 → 0, view file @ 7dd68788)
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_runtime_api.h>
#include <hiprand/hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"

#include "dropout_hip.h"
#include "feed_forward_hip.h"
#include "gelu_hip.h"
#include "general_kernels_hip.h"
#include "normalize_layer_hip.h"
#include "softmax_hip.h"
#include "strided_batch_gemm_hip.h"

struct BertGemmAlgos {
    int m_gemm_qkv_algo;
    int m_gemm_inter_algo;
    int m_gemm_output_algo;
    int m_gemm_batch1_algo;
    int m_gemm_batch2_algo;

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};

template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(unsigned layer_id,
                         unsigned batch_size,
                         unsigned hidden_size,
                         unsigned num_heads,
                         unsigned intermediate_size,
                         unsigned seq_length,
                         float attn_dropout_ratio,
                         float hidden_output_dropout_ratio,
                         float layer_norm_eps,
                         bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint,
                         bool normalize_invertible,
                         bool gelu_checkpoint,
                         bool stochastic_mode);

    virtual ~BertTransformerLayer();

    void Forward(unsigned bsz,
                 const T* input_ptr,
                 const T* input_mask_ptr,
                 const T* attn_qkvw_ptr,
                 const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr,
                 const T* attn_ob_ptr,
                 const T* attn_nw_ptr,
                 const T* attn_nb_ptr,
                 const T* inter_w_ptr,
                 const T* inter_b_ptr,
                 const T* output_w_ptr,
                 const T* output_b_ptr,
                 const T* norm_w_ptr,
                 const T* norm_b_ptr,
                 T* out_ptr,
                 T* inp_norm_ptr,
                 T* q_tf_ptr,
                 T* k_tf_ptr,
                 T* v_tf_ptr,
                 T* softmax_output_ptr,
                 T* ctx_bufB_ptr,
                 T* attn_o_inp_ptr,
                 T* add_res_ptr,
                 T* ff1_inp_ptr,
                 T* gelu_inp_ptr,
                 T* ff2_inp_ptr);

    void Backward(unsigned bsz,
                  const T* grad_output_ptr,
                  const T* input_ptr,
                  const T* output_ptr,
                  const T* inp_norm_ptr,
                  const T* q_tf_ptr,
                  const T* k_tf_ptr,
                  const T* v_tf_ptr,
                  const T* softmax_output_ptr,
                  const T* ctx_bufB_ptr,
                  const T* attn_o_inp_ptr,
                  const T* add_res_ptr,
                  const T* ff1_inp_ptr,
                  const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr,
                  const T* input_mask_ptr,
                  const T* attn_qkvw_ptr,
                  const T* attn_ow_ptr,
                  const T* attn_nw_ptr,
                  const T* attn_nb_ptr,
                  const T* inter_w_ptr,
                  const T* inter_b_ptr,
                  const T* output_w_ptr,
                  const T* norm_w_ptr,
                  const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr,
                  T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr,
                  T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr,
                  T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr,
                  T* grad_inter_b_ptr,
                  T* grad_output_w_ptr,
                  T* grad_output_b_ptr,
                  T* grad_norm_w_ptr,
                  T* grad_norm_b_ptr);

    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var,
                                T* layer_norm_mean,
                                T* attn_layer_norm_var,
                                T* attn_layer_norm_mean);

    inline unsigned GetBatchSize() const { return _batch_size; }
    inline unsigned GetNumHeads() const { return _heads; }
    inline unsigned GetSeqLength() const { return _seq_length; }
    inline unsigned GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(unsigned seq_len);
    inline unsigned GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    unsigned _layer_id;
    unsigned _batch_size;
    unsigned _hidden_size;
    unsigned _heads;
    unsigned _size_per_head;
    unsigned _intermediate_size;
    unsigned _seq_length;

    bool _pre_or_postLayerNorm;

    rocblas_handle _cublasHandle;
    hipStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High Performance flags
    bool _stochastic_mode;
};
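A minimal host-side construction sketch (illustrative only, not from the deleted file). The parameter values are placeholders; buffer wiring happens later through Forward/Backward and SetIntermediateBuffers.

// Sketch: one transformer layer with default (-1) GEMM algo indices.
std::array<int, 3> default_algos = {-1, -1, -1};
std::vector<std::array<int, 3>> gemm_algos(5, default_algos);
BertTransformerLayer<float> layer(/*layer_id=*/0,
                                  /*batch_size=*/8,
                                  /*hidden_size=*/1024,
                                  /*num_heads=*/16,
                                  /*intermediate_size=*/4096,
                                  /*seq_length=*/128,
                                  /*attn_dropout_ratio=*/0.1f,
                                  /*hidden_output_dropout_ratio=*/0.1f,
                                  /*layer_norm_eps=*/1e-12f,
                                  /*pre_or_postLayerNorm=*/true,
                                  gemm_algos,
                                  /*attn_dropout_checkpoint=*/false,
                                  /*normalize_invertible=*/false,
                                  /*gelu_checkpoint=*/false,
                                  /*stochastic_mode=*/false);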
csrc/includes/feed_forward_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"

template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    void Forward(int bsz, const T* input_ptr, const T* weights, T* out, rocblas_handle& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_transpose,
                       rocblas_operation_none,
                       config_.outputSize,
                       bsz,
                       config_.inputSize,
                       &alpha,
                       &beta,
                       weights,
                       input_ptr,
                       out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[0]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* input_ptr,
                  const T* weights,
                  T* weights_grad,
                  T* bias_grad,
                  rocblas_handle& _cublasHandle,
                  hipStream_t& stream,
                  T* inp_grad_out = nullptr,
                  T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_transpose,
                       config_.inputSize,
                       config_.outputSize,
                       bsz,
                       &alpha,
                       &beta,
                       input_ptr,
                       out_grad,
                       weights_grad,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[1]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_none,
                       config_.inputSize,
                       bsz,
                       config_.outputSize,
                       &alpha,
                       &beta,
                       weights,
                       out_grad,
                       inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[2]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }

private:
    Config config_;
};

#endif
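A minimal usage sketch of FeedForward (illustrative only). It assumes d_in, d_w, and d_out are device buffers of the appropriate sizes and handle is an initialized rocblas_handle; the -1 algo indices mirror the defaults used by BertGemmAlgos above.

// Sketch: y = W^T x over a batch of 8 tokens, hidden 1024 -> 1024.
std::array<int, 3> algos = {-1, -1, -1};
FeedForward<float>::Config cfg(/*batch=*/8, /*outputs=*/1024, /*inputs=*/1024, algos);
FeedForward<float> ff(cfg);
ff.Forward(/*bsz=*/8, d_in, d_w, d_out, handle);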
csrc/includes/gelu_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"

template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output, hipStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
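A short usage sketch (illustrative only). The pointers d_in, d_bias, and d_out are assumed to be pre-allocated device buffers, n_tokens a placeholder row count, and stream an existing hipStream_t.

// Sketch: fused bias-add + GELU over the intermediate activation.
Gelu<float> gelu(Gelu<float>::Config(/*inter_size=*/4096));
gelu.ForwardWithBiasAdd(/*bsz=*/n_tokens, d_in, d_bias, d_out, stream);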
csrc/includes/gemm_test_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers_hip.h"

template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + file + ":" + std::to_string(line) +
                      "\n");
    }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_transpose,
                           rocblas_operation_none,
                           N, M, K,
                           &alpha, &beta,
                           B, A, C,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_transpose,
                           K, N, M,
                           &alpha, &beta,
                           A, C, B,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_none,
                           K, M, N,
                           &alpha, &beta,
                           B, C, A,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b,
                    int m,
                    int n,
                    int k,
                    rocblas_operation ta,
                    rocblas_operation tb,
                    rocblas_handle h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle,
                                        M, N, K,
                                        &alpha, &beta,
                                        A, B, C,
                                        transa, transb,
                                        stride_a, stride_b, stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == rocblas_operation_transpose ? K : M);
            int kb = (transa == rocblas_operation_transpose ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B need to transpose.
            rocblas_operation op_b = (transb == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb, kb, N,
                                        &alpha, &beta,
                                        (transa == rocblas_operation_transpose ? B : C),
                                        (transa == rocblas_operation_transpose ? C : B),
                                        A,
                                        rocblas_operation_none,
                                        op_b,
                                        stride_a, stride_b, stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A need to transpose.
            rocblas_operation op_a = (transa == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K, N, M,
                                        &alpha, &beta,
                                        A, C, B,
                                        op_a,
                                        rocblas_operation_none,
                                        stride_a, stride_b, stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};
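A short usage sketch of the tuner (illustrative only). It assumes handle is an initialized rocblas_handle; the shapes are placeholders for a 1024-wide linear layer over 8 * 128 tokens, and the returned indices would feed the gemm_algos arrays used by FeedForward above.

// Sketch: time forward and both backward GEMMs, keep the fastest algo ids.
GemmTest<__half> tuner(/*m=*/1024, /*n=*/8 * 128, /*k=*/1024,
                       rocblas_operation_transpose,
                       rocblas_operation_none,
                       handle);
std::array<int, 3> best_algos = tuner.TestAlgo(/*loops=*/100);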
csrc/includes/general_kernels_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>

#include "context_hip.h"
#include "cublas_wrappers_hip.h"

#define THREADS 256
#define TILE_DIM 32

#define minus_infinity -1 * std::numeric_limits<float>::infinity()

#define FINAL_MASK 0xffffffff

template <typename T>
void launch_fused_add2(T* out,
                       const T* inp1,
                       const T* inp2,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

template <typename T>
void launch_fused_add4(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       const T* inp4,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

template <typename T>
void launch_fused_add3(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);
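A call-site sketch for these declarations (illustrative only). The pointers d_out, d_attn_out, and d_residual are assumed pre-allocated device buffers holding [batch, seq, hidden] tensors, and stream is an existing hipStream_t.

// Sketch: fused elementwise add of two activation tensors on the given stream.
launch_fused_add2<float>(d_out, d_attn_out, d_residual,
                         /*batch_size=*/8, /*seq_length=*/128,
                         /*hidden_size=*/1024, stream);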
csrc/includes/normalize_layer_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_hip_layers.h"

using namespace std;

template <typename T>
class Normalize_Layer {
public:
    struct Config {
        uint32_t batchSize;
        uint32_t seqLength;
        uint32_t hiddenDim;
        float epsilon;
        bool training;
        bool useMean;
        Config(uint32_t batch,
               uint32_t seq,
               uint32_t h,
               float epsilon = 1e-12,
               bool training = true,
               bool useMean = true)
            : batchSize(batch),
              seqLength(seq),
              hiddenDim(h),
              epsilon(epsilon),
              training(training),
              useMean(useMean)
        {
        }
    };

    Normalize_Layer(Config config)
        : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
    {
    }

    ~Normalize_Layer() {}

    void ForwardCheckpoint(int bsz,  // batch * seq
                           T* vals,
                           const T* residual,
                           const T* gamma,
                           const T* betta,
                           hipStream_t& stream,
                           bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars,
                                        means);
    }

    void Forward(int bsz,
                 T* vals,
                 const T* residual,
                 const T* gamma,
                 const T* betta,
                 hipStream_t& stream,
                 bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars);
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_in = nullptr)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_in,
                                  vars,
                                  means,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream);
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  const T* betta,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_out)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_out,
                                  vars,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream,
                                  !config_.useMean,
                                  betta);
    }

    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_in = nullptr)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_in,
                                            vars,
                                            means,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream);
    }

    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          const T* betta,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_out)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_out,
                                            vars,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream,
                                            !config_.useMean,
                                            betta);
    }

    inline bool UseMean() const { return config_.useMean; }

    inline void SetVar(T* variance)
    {
        if (!variance) { throw std::runtime_error("Normalize variance is null."); }
        vars = variance;
    }

    inline void SetMean(T* mean)
    {
        if (!mean) { throw std::runtime_error("Normalize mean is null."); }
        means = mean;
    }

private:
    Config config_;
    T* vars;
    T* means;
    T* vals_hat;
};
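A minimal usage sketch (illustrative only). It assumes d_vals, d_residual, d_gamma, d_beta, and d_var are pre-allocated device buffers (d_var holding one value per row for the backward pass) and stream is an existing hipStream_t.

// Sketch: forward layer norm without saved mean (useMean = false).
Normalize_Layer<float>::Config cfg(/*batch=*/8, /*seq=*/128, /*h=*/1024,
                                   /*epsilon=*/1e-12, /*training=*/true,
                                   /*useMean=*/false);
Normalize_Layer<float> ln(cfg);
ln.SetVar(d_var);
ln.Forward(/*bsz=*/8 * 128, d_vals, d_residual, d_gamma, d_beta, stream);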
csrc/includes/quantizer_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <cooperative_groups.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#include <cassert>
#include <iostream>