OpenDAS / deepspeed · Commits · 4acf0e01
"tests/git@developer.sourcefind.cn:OpenDAS/apex.git" did not exist on "a651e2c24ecf97cbf367fd3f330df36760e1c597"
Commit 4acf0e01 · authored Apr 26, 2023 by aiss
delete hip file

Parent: 7dd68788
Changes: 83
Showing 20 changed files with 0 additions and 2864 deletions.
Total: +0 -2864

csrc/adagrad/cpu_adagrad_hip.cpp        +0 -228
csrc/adam/cpu_adam_hip.cpp              +0 -293
csrc/adam/custom_hip_kernel.hip         +0 -22
csrc/adam/multi_tensor_adam.hip         +0 -164
csrc/adam/multi_tensor_apply_hip.cuh    +0 -129
csrc/common/custom_hip_kernel.hip       +0 -41
csrc/includes/Timer_hip.h               +0 -48
csrc/includes/context_hip.h             +0 -172
csrc/includes/cpu_adagrad_hip.h         +0 -151
csrc/includes/cpu_adam_hip.h            +0 -226
csrc/includes/cublas_wrappers_hip.h     +0 -88
csrc/includes/custom_hip_layers.h       +0 -304
csrc/includes/dropout_hip.h             +0 -77
csrc/includes/ds_transformer_hip.h      +0 -185
csrc/includes/feed_forward_hip.h        +0 -106
csrc/includes/gelu_hip.h                +0 -37
csrc/includes/gemm_test_hip.h           +0 -328
csrc/includes/general_kernels_hip.h     +0 -52
csrc/includes/normalize_layer_hip.h     +0 -203
csrc/includes/quantizer_hip.h           +0 -10
csrc/adagrad/cpu_adagrad_hip.cpp  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adagrad_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;

// C++ interface

void Adagrad_Optimizer::Step_1(float* _params,
                               float* grads,
                               float* _exp_avg_sq,
                               size_t _param_size,
                               __half* dev_params,
                               bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size) {
        float step_size = -1 * _alpha;
        __half* grads_cast_h;
        __half* params_cast_h;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }
        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                float momentum = grads[k];
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0) { grad = param * _weight_decay + grad; }

                variance += grad * grad;

                grad = sqrt(variance);
                grad += _eps;
                grad = momentum / grad;
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
                if (half_precision)
                    params_cast_h[k] = (__half)param;
                else
                    _params[k] = param;
                // STORE UPDATE TERM TO GRAD'S MEMORY
                grads[k] = grad * step_size;
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
                _buf_index = !_buf_index;
            }
        }
    }
}

void Adagrad_Optimizer::Step_4(float* _params,
                               float* grads,
                               float* _exp_avg_sq,
                               size_t _param_size,
                               __half* dev_params,
                               bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<4>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_1((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int create_adagrad_optimizer(int optimizer_id,
                             float alpha = 1e-2,
                             float eps = 1e-8,
                             float weight_decay = 0,
                             bool should_log = false)
{
    auto opt = std::make_shared<Adagrad_Optimizer>(alpha, eps, weight_decay);

    s_optimizers[optimizer_id] = opt;

    if (should_log) {
        std::string avx_type = "";
#if defined(__AVX512__)
        avx_type = "AVX512";
#else
#if defined(__AVX256__)
        avx_type = "AVX2";
#else
        avx_type = "scalar";
#endif
#endif

        printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n",
               optimizer_id,
               avx_type.c_str());
        printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay);
    }

    return 0;
}

void Adagrad_Optimizer::Step_8(float* _params,
                               float* grads,
                               float* _exp_avg_sq,
                               size_t _param_size,
                               __half* dev_params,
                               bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<8>(&rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision);
#endif
    if (_param_size > rounded_size)
        Step_4((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int ds_adagrad_step(int optimizer_id,
                    size_t step,
                    float lr,
                    float epsilon,
                    float weight_decay,
                    torch::Tensor& params,
                    torch::Tensor& grads,
                    torch::Tensor& exp_avg_sq)
{
    auto params_c = params.contiguous();
    auto grads_c = grads.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adagrad_Optimizer> opt =
        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step);
    opt->update_state(lr, epsilon, weight_decay);
    opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0));

    opt->SynchronizeStreams();
    return 0;
}

int ds_adagrad_step_plus_copy(int optimizer_id,
                              size_t step,
                              float lr,
                              float epsilon,
                              float weight_decay,
                              torch::Tensor& params,
                              torch::Tensor& grads,
                              torch::Tensor& exp_avg_sq,
                              torch::Tensor& gpu_params)
{
    auto params_c = params.contiguous();
    auto gpu_params_c = gpu_params.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();
    auto grads_c = grads.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adagrad_Optimizer> opt =
        std::static_pointer_cast<Adagrad_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step);
    opt->update_state(lr, epsilon, weight_decay);
    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                gpu_params_ptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int destroy_adagrad_optimizer(int optimizer_id)
{
    s_optimizers.erase(optimizer_id);

    return 0;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)");
    m.def("adagrad_update_copy",
          &ds_adagrad_step_plus_copy,
          "DeepSpeed CPU Adagrad update and param copy (C++)");
    m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)");
    m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)");
}
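The block below is an editorial sketch, not part of the deleted file: a hypothetical C++ driver showing how the entry points bound above (create_adagrad_optimizer, ds_adagrad_step, destroy_adagrad_optimizer) could be exercised directly, assuming the same torch/extension environment this file already includes and a HIP-capable build in which Context::Instance() can create streams.

// Hypothetical driver (not part of the commit): one Adagrad step on CPU tensors.
#include <torch/torch.h>

int main()
{
    const int opt_id = 0;
    create_adagrad_optimizer(opt_id, /*alpha=*/1e-2, /*eps=*/1e-8, /*weight_decay=*/0, /*should_log=*/true);

    auto params = torch::zeros({1024}, torch::kFloat32);
    auto grads = torch::ones({1024}, torch::kFloat32);
    auto exp_avg_sq = torch::zeros({1024}, torch::kFloat32);

    // Tensors are updated in place through their data pointers.
    ds_adagrad_step(opt_id, /*step=*/1, /*lr=*/1e-2, /*epsilon=*/1e-8, /*weight_decay=*/0,
                    params, grads, exp_avg_sq);

    destroy_adagrad_optimizer(opt_id);
    return 0;
}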
csrc/adam/cpu_adam_hip.cpp  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#include "cpu_adam_hip.h"
#include <hip/hip_runtime_api.h>
#include <math.h>
#include <omp.h>
#include <torch/extension.h>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "custom_hip_layers.h"
static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;

// C++ interface

void Adam_Optimizer::Step_1(float* _params,
                            float* grads,
                            float* _exp_avg,
                            float* _exp_avg_sq,
                            size_t _param_size,
                            __half* dev_params,
                            bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(&rounded_size,
                _params,
                grads,
                _exp_avg,
                _exp_avg_sq,
                _param_size,
                dev_params,
                half_precision);
#endif
    if (_param_size > rounded_size) {
        float betta1_minus1 = 1 - _betta1;
        float betta2_minus1 = 1 - _betta2;

        float step_size = -1 * _alpha / _bias_correction1;
        float w_decay = -1 * _alpha * _weight_decay;
        __half* grads_cast_h;
        __half* params_cast_h;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<__half*>(grads);
            params_cast_h = reinterpret_cast<__half*>(_params);
        }

        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
            if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                float momentum = _exp_avg[k];
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
                momentum = momentum * _betta1;
                momentum = grad * betta1_minus1 + momentum;

                variance = variance * _betta2;
                grad = grad * grad;
                variance = grad * betta2_minus1 + variance;

                grad = sqrt(variance);
                grad = grad * _bias_correction2 + _eps;
                grad = momentum / grad;
                if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
                param = grad * step_size + param;
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;

                if (half_precision)
                    params_cast_h[k] = (__half)param;
                else
                    _params[k] = param;
                _exp_avg[k] = momentum;
                _exp_avg_sq[k] = variance;
            }
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);

                _buf_index = !_buf_index;
            }
        }
    }
}

void Adam_Optimizer::Step_4(float* _params,
                            float* grads,
                            float* _exp_avg,
                            float* _exp_avg_sq,
                            size_t _param_size,
                            __half* dev_params,
                            bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<4>(&rounded_size,
                _params,
                grads,
                _exp_avg,
                _exp_avg_sq,
                _param_size,
                dev_params,
                half_precision);
#endif
    if (_param_size > rounded_size)
        Step_1((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int create_adam_optimizer(int optimizer_id,
                          float alpha = 1e-3,
                          float betta1 = 0.9,
                          float betta2 = 0.999,
                          float eps = 1e-8,
                          float weight_decay = 0,
                          bool adamw_mode = true,
                          bool should_log = false)
{
    auto opt =
        std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);

    s_optimizers[optimizer_id] = opt;

    if (should_log) {
        std::string avx_type = "";
#if defined(__AVX512__)
        avx_type = "AVX512";
#else
#if defined(__AVX256__)
        avx_type = "AVX2";
#else
        avx_type = "scalar";
#endif
#endif

        printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
               optimizer_id,
               avx_type.c_str());
        printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
               alpha,
               betta1,
               betta2,
               weight_decay,
               (int)adamw_mode);
    }

    return 0;
}

void Adam_Optimizer::Step_8(float* _params,
                            float* grads,
                            float* _exp_avg,
                            float* _exp_avg_sq,
                            size_t _param_size,
                            __half* dev_params,
                            bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<8>(&rounded_size,
                _params,
                grads,
                _exp_avg,
                _exp_avg_sq,
                _param_size,
                dev_params,
                half_precision);
#endif
    if (_param_size > rounded_size)
        Step_4((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int ds_adam_step(int optimizer_id,
                 size_t step,
                 float lr,
                 float beta1,
                 float beta2,
                 float epsilon,
                 float weight_decay,
                 bool bias_correction,
                 torch::Tensor& params,
                 torch::Tensor& grads,
                 torch::Tensor& exp_avg,
                 torch::Tensor& exp_avg_sq)
{
    auto params_c = params.contiguous();
    auto grads_c = grads.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();

    // assert(params.options().dtype() == grads.options().dtype());

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);

    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                nullptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int ds_adam_step_plus_copy(int optimizer_id,
                           size_t step,
                           float lr,
                           float beta1,
                           float beta2,
                           float epsilon,
                           float weight_decay,
                           bool bias_correction,
                           torch::Tensor& params,
                           torch::Tensor& grads,
                           torch::Tensor& exp_avg,
                           torch::Tensor& exp_avg_sq,
                           torch::Tensor& gpu_params)
{
    auto params_c = params.contiguous();
    auto gpu_params_c = gpu_params.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();
    auto grads_c = grads.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    __half* gpu_params_ptr = (__half*)gpu_params_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);

    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.size(0),
                gpu_params_ptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
    return 0;
}

int destroy_adam_optimizer(int optimizer_id)
{
    s_optimizers.erase(optimizer_id);

    return 0;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)");
    m.def("adam_update_copy",
          &ds_adam_step_plus_copy,
          "DeepSpeed CPU Adam update and param copy (C++)");
    m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)");
    m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)");
}
csrc/adam/custom_hip_kernel.hip  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/adam/multi_tensor_adam.hip  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "multi_tensor_apply_hip.cuh"
#include "type_shim_hip.h"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum {
ADAM_MODE_0 = 0, // L2 regularization mode
ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW)
} adamMode_t;
using MATH_T = float;
template <typename T>
struct AdamFunctor {
__device__ __forceinline__ void operator()(int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float beta1_correction,
const float beta2_correction,
const float epsilon,
const float lr,
adamMode_t mode,
const float decay)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
// potentially use to pass in list of scalar
// int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx * chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx * chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx * chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx * chunk_size;
n -= chunk_idx * chunk_size;
// see note in multi_tensor_scale_kernel.cu
for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) {
MATH_T r_g[ILP];
MATH_T r_p[ILP];
MATH_T r_m[ILP];
MATH_T r_v[ILP];
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = MATH_T(0);
r_p[ii] = MATH_T(0);
r_m[ii] = MATH_T(0);
r_v[ii] = MATH_T(0);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
if (mode == ADAM_MODE_0) { // L2
r_g[ii] = r_g[ii] + (decay * r_p[ii]);
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = next_m_unbiased / denom;
r_p[ii] = r_p[ii] - (lr * update);
} else { // weight decay
r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii];
MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (lr * update);
}
}
#pragma unroll
for (int ii = 0; ii < ILP; ii++) {
int i = i_start + threadIdx.x + ii * blockDim.x;
if (i < n && i < chunk_size) {
p[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
void multi_tensor_adam_cuda(int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
// Handle bias correction mode
float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
if (bias_correction == 1) {
bias_correction1 = 1 - ::pow(beta1, step);
bias_correction2 = 1 - ::pow(beta2, step);
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(),
0,
"adam",
multi_tensor_apply<4>(BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
bias_correction1,
bias_correction2,
epsilon,
lr,
(adamMode_t)mode,
weight_decay);)
AT_CUDA_CHECK(hipGetLastError());
}
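For reference, the per-element update that AdamFunctor applies in its decoupled weight-decay branch (ADAM_MODE_1) is the standard bias-corrected AdamW step written out below; the L2 branch (ADAM_MODE_0) instead folds decay * p into the gradient before the moment updates. The notation mirrors the kernel, where beta1_correction = 1 - beta1^step and beta2_correction = 1 - beta2^step when bias_correction == 1.

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    \hat{m}_t = m_t / (1 - \beta_1^t), \qquad \hat{v}_t = v_t / (1 - \beta_2^t)
    p_t = p_{t-1} - \mathrm{lr} \left( \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon) + \lambda\, p_{t-1} \right)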
csrc/adam/multi_tensor_apply_hip.cuh  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/* Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/Exceptions.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include "compat.h"
#include <assert.h>
// #include <iostream>
// This header is the one-stop shop for all your multi-tensor apply needs.
// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson)
constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};

template <int n>
struct TensorListMetadata {
    void* addresses[n][depth_to_max_tensors[n - 1]];
    int sizes[depth_to_max_tensors[n - 1]];
    unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
    int block_to_chunk[depth_to_max_blocks[n - 1]];
    // I fear this needs to be a full int.
    int start_tensor_this_launch;
};

template <typename T, typename U, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(int chunk_size,
                                          volatile int* noop_flag,
                                          T tl,
                                          U callable,
                                          ArgTypes... args)
{
    // Hand the chunk information to the user-supplied functor to process however it likes.
    callable(chunk_size, noop_flag, tl, args...);
}

template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int block_size,
                        int chunk_size,
                        const at::Tensor& noop_flag,
                        const std::vector<std::vector<at::Tensor>>& tensor_lists,
                        T callable,
                        ArgTypes... args)
{
    TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
    int len0 = tensor_lists[0].size();
    TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
    auto ref_device = tensor_lists[0][0].device();
    TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
    for (int l = 0; l < tensor_lists.size(); l++)  // No range-based for because I need indices
    {
        TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
        for (int t = 0; t < tensor_lists[l].size(); t++) {
            // TODO: Print which tensor fails.
            bool contiguous_memory = tensor_lists[l][t].is_contiguous();
#ifdef VERSION_GE_1_5
            contiguous_memory =
                (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
            TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
            TORCH_CHECK(tensor_lists[l][t].device() == ref_device,
                        "A tensor was not on the same device as the first tensor");
            TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
        }
    }

    int ntensors = tensor_lists[0].size();

    TensorListMetadata<depth> tl;

    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(tensor_lists[0][0]));
    auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();

    tl.start_tensor_this_launch = 0;
    int loc_block_info = 0;
    int loc_tensor_info = 0;
    for (int t = 0; t < ntensors; t++) {
        tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel();
        for (int d = 0; d < depth; d++)
            tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
        loc_tensor_info++;

        int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size;

        for (int chunk = 0; chunk < chunks_this_tensor; chunk++) {
            // std::cout << chunks_this_tensor << std::endl;
            tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
            tl.block_to_chunk[loc_block_info] = chunk;
            loc_block_info++;

            bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
                                 chunk == chunks_this_tensor - 1);
            bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]);
            bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1);
            if (tensors_full || blocks_full || last_chunk) {
                // using accscalar_t = acc_type<scalar_t, true>;
                hipLaunchKernelGGL((multi_tensor_apply_kernel),
                                   dim3(loc_block_info),
                                   dim3(block_size),
                                   0,
                                   stream,
                                   chunk_size,
                                   noop_flag.DATA_PTR<int>(),
                                   tl,
                                   callable,
                                   args...);

                AT_CUDA_CHECK(hipGetLastError());

                // Reset. The control flow possibilities here make my brain hurt.
                loc_block_info = 0;
                if (chunk == chunks_this_tensor - 1) {
                    // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 <<
                    // std::endl;
                    loc_tensor_info = 0;
                    tl.start_tensor_this_launch = t + 1;
                } else {
                    // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 <<
                    // std::endl;
                    tl.sizes[0] = tl.sizes[loc_tensor_info - 1];
                    for (int d = 0; d < depth; d++)
                        tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1];
                    loc_tensor_info = 1;
                    tl.start_tensor_this_launch = t;
                }
            }
        }
    }
}
csrc/common/custom_hip_kernel.hip  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < size) { output[id] = (__half)input[id]; }
}
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
__half2* output_cast = reinterpret_cast<__half2*>(output);
if (id < size) {
float input_f = input[id];
__half2* input_h = reinterpret_cast<__half2*>(&input_f);
output_cast[id] = *input_h;
}
}
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream)
{
int threads = 1024;
size /= 2;
dim3 grid_dim((size - 1) / threads + 1);
dim3 block_dim(threads);
hipLaunchKernelGGL(( param_update_kernel_half), dim3(grid_dim), dim3(block_dim), 0, stream, input, output, size);
}
csrc/includes/Timer_hip.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#ifndef __TIMER_H__
#define __TIMER_H__
#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"
class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    inline void Record() { hipEventRecord(start); }
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp).count() /
                       1e3);
    }
};
#endif
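As an illustration only (not part of the commit), the GPUTimer above is typically used by bracketing a stretch of GPU work between Record() and Elapsed(); the sketch below assumes a HIP build and some kernels enqueued on the default stream in between.

// Hypothetical usage sketch of GPUTimer (assumes the class above is in scope).
#include <cstdio>

void time_gpu_section()
{
    GPUTimer timer;
    timer.Record();     // record the start event
    // ... enqueue kernels / async copies here ...
    float ms = 0.0f;
    timer.Elapsed(ms);  // record stop, synchronize, and report elapsed milliseconds
    printf("GPU section took %f ms\n", ms);
}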
csrc/includes/context_hip.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand/hiprand.h"
#include "gemm_test_hip.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}

class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        hipFree(_workspace);
    }

    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;
    rocblas_handle _cublasHandle;
    void* _workspace;
    uint64_t _seed;
    uint64_t _curr_offset;
    std::vector<std::array<int, 3>> _gemm_algos;
};
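As an illustration only (not part of the commit), the Context singleton above is the access point the other files use for the shared rocBLAS handle, the PyTorch streams, and the dropout RNG offset; a hypothetical call site might look like this.

// Hypothetical call site (assumes the Context class above is in scope).
void example_context_use()
{
    rocblas_handle blas = Context::Instance().GetCublasHandle();
    hipStream_t stream = Context::Instance().GetCurrentStream();
    // Advance the RNG offset by 16; returns (seed, previous offset).
    std::pair<uint64_t, uint64_t> seed_and_offset = Context::Instance().IncrementOffset(16);
    (void)blas;
    (void)stream;
    (void)seed_and_offset;
}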
csrc/includes/cpu_adagrad_hip.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adagrad_Optimizer {
public:
    Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0)
        : _alpha(alpha), _eps(eps), _weight_decay(weight_decay), _buf_index(false)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adagrad_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step)
    {
        _step++;
        if (_step != step) { _step = step; }
    }
    inline void update_state(float lr, float epsilon, float weight_decay)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;
    }

private:
    float _alpha;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float* _doubled_buffer[2];
    bool _buf_index;
    hipStream_t _streams[2];
};

#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
                                 float* _params,
                                 float* grads,
                                 float* _exp_avg_sq,
                                 size_t _param_size,
                                 __half* dev_params,
                                 bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    float step_size = -1 * _alpha;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    AVX_Data weight_decay4;
    if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay);
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, grads + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0) { simd_fma<span>(grad_4, param_4, weight_decay4, grad_4); }

            simd_fma<span>(variance_4, grad_4, grad_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_add<span>(grad_4, grad_4, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);
            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
csrc/includes/cpu_adam_hip.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#pragma once
#define NOMINMAX // Windows idiosyncrasy
// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <cassert>
#include "hip/hip_runtime.h"
#include "custom_hip_layers.h"
#include "simd.h"
#define STEP(SPAN) \
void Step_##SPAN(float* _params, \
float* grads, \
float* _exp_avg, \
float* _exp_avg_sq, \
size_t _param_size, \
__half* dev_param = nullptr, \
bool half_precision = false);
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
#if defined(__AVX512__) or defined(__AVX256__)
    template <int span>
    void Step_AVX(size_t* rounded_size,
                  float* _params,
                  float* grads,
                  float* _exp_avg,
                  float* _exp_avg_sq,
                  size_t param_size,
                  __half* dev_param = nullptr,
                  bool half_precision = false);
#endif
    STEP(1)
    STEP(4)
    STEP(8)
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float _bias_correction1;
    float _bias_correction2;

    float* _doubled_buffer[2];
    bool _buf_index;
    bool _adamw_mode;

    hipStream_t _streams[2];
};

#if defined(__AVX512__) or defined(__AVX256__)
template <int span>
void Adam_Optimizer::Step_AVX(size_t* rounded_size,
                              float* _params,
                              float* grads,
                              float* _exp_avg,
                              float* _exp_avg_sq,
                              size_t _param_size,
                              __half* dev_params,
                              bool half_precision)
{
    size_t new_rounded_size = 0;
    AVX_Data betta1_4;
    betta1_4.data = SIMD_SET(_betta1);
    AVX_Data betta2_4;
    betta2_4.data = SIMD_SET(_betta2);

    float betta1_minus1 = 1 - _betta1;
    float betta2_minus1 = 1 - _betta2;
    AVX_Data betta1_minus1_4;
    betta1_minus1_4.data = SIMD_SET(betta1_minus1);
    AVX_Data betta2_minus1_4;
    betta2_minus1_4.data = SIMD_SET(betta2_minus1);

    AVX_Data bias2_sqrt;
    bias2_sqrt.data = SIMD_SET(_bias_correction2);

    AVX_Data eps_4;
    eps_4.data = SIMD_SET(_eps);

    float step_size = -1 * _alpha / _bias_correction1;
    AVX_Data step_size_4;
    step_size_4.data = SIMD_SET(step_size);

    float w_decay = -1 * _alpha * _weight_decay;
    AVX_Data weight_decay4;
    if (_weight_decay > 0)
        weight_decay4.data = (_adamw_mode ? SIMD_SET(w_decay) : SIMD_SET(_weight_decay));
    new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span);
    for (size_t t = 0; t < new_rounded_size; t += TILE) {
        size_t copy_size = TILE;
        if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t;
        size_t offset = copy_size + t;
        if ((t / TILE) >= 2) { hipStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
        for (size_t i = t; i < offset; i += SIMD_WIDTH * span) {
            AVX_Data grad_4[span];
            simd_load<span>(grad_4, grads + i, half_precision);

            AVX_Data momentum_4[span];
            simd_load<span>(momentum_4, _exp_avg + i, false);

            AVX_Data variance_4[span];
            simd_load<span>(variance_4, _exp_avg_sq + i, false);

            AVX_Data param_4[span];
            simd_load<span>(param_4, _params + i, half_precision);

            if (_weight_decay > 0 && !_adamw_mode) {
                simd_fma<span>(grad_4, param_4, weight_decay4, grad_4);
            }

            simd_mul<span>(momentum_4, momentum_4, betta1_4);
            simd_fma<span>(momentum_4, grad_4, betta1_minus1_4, momentum_4);
            simd_mul<span>(variance_4, variance_4, betta2_4);
            simd_mul<span>(grad_4, grad_4, grad_4);
            simd_fma<span>(variance_4, grad_4, betta2_minus1_4, variance_4);
            simd_sqrt<span>(grad_4, variance_4);
            simd_fma<span>(grad_4, grad_4, bias2_sqrt, eps_4);
            simd_div<span>(grad_4, momentum_4, grad_4);

            if (_weight_decay > 0 && _adamw_mode) {
                simd_fma<span>(param_4, param_4, weight_decay4, param_4);
            }

            simd_fma<span>(param_4, grad_4, step_size_4, param_4);

            simd_store<span>(_params + i, param_4, half_precision);
            if (dev_params) {
                simd_store<span>(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision);
            }
            simd_store<span>(_exp_avg + i, momentum_4, false);
            simd_store<span>(_exp_avg_sq + i, variance_4, false);
        }
        if (dev_params) {
            if (half_precision)
                launch_param_update_half(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
            else
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);

            _buf_index = !_buf_index;
        }
    }
    *rounded_size = new_rounded_size;
}
#endif
csrc/includes/cublas_wrappers_hip.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const float* A,
                   const float* B,
                   float* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const __half* A,
                   const __half* B,
                   __half* C,
#ifdef __HIP_PLATFORM_HCC__
                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const float* A,
                                const float* B,
                                float* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
#endif

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const __half* A,
                                const __half* B,
                                __half* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
#ifdef __HIP_PLATFORM_HCC__
                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
#endif
csrc/includes/custom_hip_layers.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8 // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
#define MAX_REG 256
#define WARP_SIZE_BITS 5
template <typename T>
void launch_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
template <typename T>
void launch_sr_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);

// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
                      const T* bias,
                      T* output,
                      int intermediate_size,
                      int batch_size,
                      hipStream_t stream);

template <typename T>
void launch_gelu(const T* input, T* output, int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_d_gelu(T* d_output,
                   const T* input,
                   const T* bias,
                   int intermediate_size,
                   int batch_size,
                   hipStream_t stream);

// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
                                     const T* residual,
                                     const T* gamma,
                                     const T* beta,
                                     float epsilon,
                                     int batch_size,
                                     int hidden_dim,
                                     hipStream_t stream,
                                     bool preLayerNorm,
                                     bool training,
                                     T* vars,
                                     T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals,
                                     const T* residual,
                                     const T* gamma,
                                     const T* beta,
                                     float epsilon,
                                     int batch_size,
                                     int hidden_dim,
                                     hipStream_t stream,
                                     bool preLayerNorm,
                                     bool training,
                                     T* vars);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
                                         const T* out_grad2,
                                         const T* X_data,
                                         const T* vars,
                                         const T* means,
                                         const T* gamma,
                                         T* gamma_grad,
                                         T* betta_grad,
                                         T* inp_grad,
                                         int batch_size,
                                         int hidden_dim,
                                         hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
                                         const T* out_grad2,
                                         const T* vals_hat,
                                         const T* vars,
                                         const T* gamma,
                                         T* gamma_grad,
                                         T* betta_grad,
                                         T* inp_grad,
                                         int batch_size,
                                         int hidden_dim,
                                         hipStream_t stream[2],
                                         bool invertible = false,
                                         const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* X_data,
                               const T* vars,
                               const T* means,
                               const T* gamma,
                               T* gamma_grad,
                               T* betta_grad,
                               T* inp_grad,
                               int batch_size,
                               int hidden_dim,
                               hipStream_t stream[2]);
template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* vals_hat,
                               const T* vars,
                               const T* gamma,
                               T* gamma_grad,
                               T* betta_grad,
                               T* inp_grad,
                               int batch_size,
                               int hidden_dim,
                               hipStream_t stream[2],
                               bool invertible = false,
                               const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
                                           const T* vals,
                                           const T* out_grad_trans,
                                           const T* vals_trans,
                                           const T* means,
                                           const T* vars,
                                           const T* gamma,
                                           T* gamma_grad,
                                           T* betta_grad,
                                           T* inp_grad,
                                           int batch_size,
                                           int hidden_dim,
                                           hipStream_t stream[2]);

template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward(T* out_grad,
                                  const T* soft_inp,
                                  int batch_size,
                                  int heads,
                                  int seq_length,
                                  hipStream_t stream);
template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
                                     const T* soft_inp,
                                     int batch_size,
                                     int heads,
                                     int seq_length,
                                     hipStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
                         const T* attn_mask,
                         int batch_size,
                         int heads,
                         int sequence_length,
                         hipStream_t stream);

template <typename T>
void launch_transform_0213(T* output,
                           const T* vals,
                           int batch_size,
                           int seq_length,
                           int hidden_dim,
                           int heads,
                           hipStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
                                    const T* vals,
                                    const T* bias,
                                    int batch_size,
                                    int seq_length,
                                    int hidden_dim,
                                    int heads,
                                    hipStream_t stream,
                                    int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
                             const T* in,
                             int batch_size,
                             int heads,
                             int seq_length,
                             int hidden_dim,
                             hipStream_t stream,
                             int trans_count);

template <typename T>
void launch_dropout(T* vals,
                    const T* bias,
                    uint8_t* mask,
                    int batch,
                    int dim,
                    float ratio,
                    hipStream_t stream);
template <typename T>
void launch_dropout(T* vals_out,
                    const T* vals,
                    uint8_t* mask,
                    int total_count,
                    int dim,
                    float ratio,
                    hipStream_t stream,
                    bool bwd = false);
template <typename T>
void launch_dropout(T* out,
                    const T* vals,
                    const T* residual,
                    const T* bias,
                    uint8_t* mask,
                    int batch,
                    int dim,
                    float ratio,
                    hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream);
template <typename T>
void launch_dropout_grad(T* vals_out,
                         const T* vals,
                         uint8_t* mask,
                         int total_count,
                         float ratio,
                         hipStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols, hipStream_t stream);

void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream);
csrc/includes/dropout_hip.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;
        uint32_t dim;
        bool training;

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        float RATIO() const { return training ? ratio : 0.0; }
        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void ForwardWithBias(int bsz,
                         T* out,
                         const T* vals,
                         const T* residual,
                         const T* bias,
                         hipStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }

        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
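As an illustration only (not part of the commit), a hypothetical forward-pass call site for the Dropout<T> wrapper above; the mask buffer and the launch_dropout / launch_dropout_grad kernels it relies on come from the other headers in this commit.

// Hypothetical call site (assumes Dropout<T> and the HIP runtime are in scope).
void apply_dropout(float* out, const float* activations, uint8_t* mask_buf,
                   int batch, int dim, hipStream_t stream)
{
    Dropout<float> dropout(Dropout<float>::Config(/*ratio=*/0.1f, /*dim=*/(uint32_t)dim));
    dropout.SetMask(mask_buf);      // mask is filled in the forward pass and reused in Backward()
    dropout.SetTrainingMode(true);  // RATIO() returns 0.0 when training is false
    dropout.Forward(batch, out, activations, stream);
}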
csrc/includes/ds_transformer_hip.h  deleted  100644 → 0
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime_api.h>
#include <hiprand/hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "dropout_hip.h"
#include "feed_forward_hip.h"
#include "gelu_hip.h"
#include "general_kernels_hip.h"
#include "normalize_layer_hip.h"
#include "softmax_hip.h"
#include "strided_batch_gemm_hip.h"
struct
BertGemmAlgos
{
int
m_gemm_qkv_algo
;
int
m_gemm_inter_algo
;
int
m_gemm_output_algo
;
int
m_gemm_batch1_algo
;
int
m_gemm_batch2_algo
;
BertGemmAlgos
()
:
m_gemm_qkv_algo
(
-
1
),
m_gemm_inter_algo
(
-
1
),
m_gemm_output_algo
(
-
1
),
m_gemm_batch1_algo
(
-
1
),
m_gemm_batch2_algo
(
-
1
)
{
}
};
template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(unsigned layer_id,
                         unsigned batch_size,
                         unsigned hidden_size,
                         unsigned num_heads,
                         unsigned intermediate_size,
                         unsigned seq_length,
                         float attn_dropout_ratio,
                         float hidden_output_dropout_ratio,
                         float layer_norm_eps,
                         bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint,
                         bool normalize_invertible,
                         bool gelu_checkpoint,
                         bool stochastic_mode);

    virtual ~BertTransformerLayer();

    void Forward(unsigned bsz,
                 const T* input_ptr,
                 const T* input_mask_ptr,
                 const T* attn_qkvw_ptr,
                 const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr,
                 const T* attn_ob_ptr,
                 const T* attn_nw_ptr,
                 const T* attn_nb_ptr,
                 const T* inter_w_ptr,
                 const T* inter_b_ptr,
                 const T* output_w_ptr,
                 const T* output_b_ptr,
                 const T* norm_w_ptr,
                 const T* norm_b_ptr,
                 T* out_ptr,
                 T* inp_norm_ptr,
                 T* q_tf_ptr,
                 T* k_tf_ptr,
                 T* v_tf_ptr,
                 T* softmax_output_ptr,
                 T* ctx_bufB_ptr,
                 T* attn_o_inp_ptr,
                 T* add_res_ptr,
                 T* ff1_inp_ptr,
                 T* gelu_inp_ptr,
                 T* ff2_inp_ptr);

    void Backward(unsigned bsz,
                  const T* grad_output_ptr,
                  const T* input_ptr,
                  const T* output_ptr,
                  const T* inp_norm_ptr,
                  const T* q_tf_ptr,
                  const T* k_tf_ptr,
                  const T* v_tf_ptr,
                  const T* softmax_output_ptr,
                  const T* ctx_bufB_ptr,
                  const T* attn_o_inp_ptr,
                  const T* add_res_ptr,
                  const T* ff1_inp_ptr,
                  const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr,
                  const T* input_mask_ptr,
                  const T* attn_qkvw_ptr,
                  const T* attn_ow_ptr,
                  const T* attn_nw_ptr,
                  const T* attn_nb_ptr,
                  const T* inter_w_ptr,
                  const T* inter_b_ptr,
                  const T* output_w_ptr,
                  const T* norm_w_ptr,
                  const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr,
                  T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr,
                  T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr,
                  T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr,
                  T* grad_inter_b_ptr,
                  T* grad_output_w_ptr,
                  T* grad_output_b_ptr,
                  T* grad_norm_w_ptr,
                  T* grad_norm_b_ptr);

    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var,
                                T* layer_norm_mean,
                                T* attn_layer_norm_var,
                                T* attn_layer_norm_mean);

    inline unsigned GetBatchSize() const { return _batch_size; }
    inline unsigned GetNumHeads() const { return _heads; }
    inline unsigned GetSeqLength() const { return _seq_length; }
    inline unsigned GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(unsigned seq_len);
    inline unsigned GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    unsigned _layer_id;
    unsigned _batch_size;
    unsigned _hidden_size;
    unsigned _heads;
    unsigned _size_per_head;
    unsigned _intermediate_size;
    unsigned _seq_length;

    bool _pre_or_postLayerNorm;

    rocblas_handle _cublasHandle;
    hipStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High Performance flags
    bool _stochastic_mode;
};
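For context, a rough sketch of how this interface is constructed (all sizes and flags here are illustrative placeholders chosen by the editor, not values from this commit):

std::array<int, 3> default_algos = {-1, -1, -1};
std::vector<std::array<int, 3>> algos(5, default_algos);  // one entry per tuned GEMM
BertTransformerLayer<float> layer(/*layer_id=*/0,
                                  /*batch_size=*/8,
                                  /*hidden_size=*/1024,
                                  /*num_heads=*/16,
                                  /*intermediate_size=*/4096,
                                  /*seq_length=*/128,
                                  /*attn_dropout_ratio=*/0.1f,
                                  /*hidden_output_dropout_ratio=*/0.1f,
                                  /*layer_norm_eps=*/1e-12f,
                                  /*pre_or_postLayerNorm=*/true,
                                  algos,
                                  /*attn_dropout_checkpoint=*/false,
                                  /*normalize_invertible=*/false,
                                  /*gelu_checkpoint=*/false,
                                  /*stochastic_mode=*/false);
// Forward()/Backward() are then called with the parameter and activation pointers listed above.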
csrc/includes/feed_forward_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    void Forward(int bsz, const T* input_ptr, const T* weights, T* out, rocblas_handle& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_transpose,
                       rocblas_operation_none,
                       config_.outputSize,
                       bsz,
                       config_.inputSize,
                       &alpha,
                       &beta,
                       weights,
                       input_ptr,
                       out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[0]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* input_ptr,
                  const T* weights,
                  T* weights_grad,
                  T* bias_grad,
                  rocblas_handle& _cublasHandle,
                  hipStream_t& stream,
                  T* inp_grad_out = nullptr,
                  T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_transpose,
                       config_.inputSize,
                       config_.outputSize,
                       bsz,
                       &alpha,
                       &beta,
                       input_ptr,
                       out_grad,
                       weights_grad,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[1]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_none,
                       config_.inputSize,
                       bsz,
                       config_.outputSize,
                       &alpha,
                       &beta,
                       weights,
                       out_grad,
                       inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[2]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }

private:
    Config config_;
};

#endif
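A minimal, hedged example of driving the Forward path above (device pointers and the rocBLAS handle are assumed to be created by the caller):

FeedForward<float> ff(FeedForward<float>::Config(/*batch=*/bsz, /*outputs=*/out_dim, /*inputs=*/in_dim, {-1, -1, -1}));
ff.Forward(bsz, d_input, d_weights, d_output, handle);
// i.e. a single cublas_gemm_ex call producing the [out_dim x bsz] product of the transposed weights and the input.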
csrc/includes/gelu_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz,
                            const T* input_buf,
                            const T* bias,
                            T* output,
                            hipStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
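For context, a hedged sketch of the two entry points above (buffer names are placeholders):

Gelu<float> gelu(Gelu<float>::Config(intermediate_size));
gelu.ForwardWithBiasAdd(bsz, d_ff1_out, d_inter_bias, d_gelu_out, stream);  // output = gelu(input + bias)
gelu.Backward(bsz, d_grad, d_ff1_out, d_inter_bias, stream);                // rescales d_grad by gelu'(input + bias) in place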
csrc/includes/gemm_test_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers_hip.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + file + ":" + std::to_string(line) +
                      "\n");
    }
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_transpose,
                           rocblas_operation_none,
                           N,
                           M,
                           K,
                           &alpha,
                           &beta,
                           B,
                           A,
                           C,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_transpose,
                           K,
                           N,
                           M,
                           &alpha,
                           &beta,
                           A,
                           C,
                           B,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_none,
                           K,
                           M,
                           N,
                           &alpha,
                           &beta,
                           B,
                           C,
                           A,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();

            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};
template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b,
                    int m,
                    int n,
                    int k,
                    rocblas_operation ta,
                    rocblas_operation tb,
                    rocblas_handle h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle,
                                        M,
                                        N,
                                        K,
                                        &alpha,
                                        &beta,
                                        A,
                                        B,
                                        C,
                                        transa,
                                        transb,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == rocblas_operation_transpose ? K : M);
            int kb = (transa == rocblas_operation_transpose ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B need to transpose.
            rocblas_operation op_b = (transb == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb,
                                        kb,
                                        N,
                                        &alpha,
                                        &beta,
                                        (transa == rocblas_operation_transpose ? B : C),
                                        (transa == rocblas_operation_transpose ? C : B),
                                        A,
                                        rocblas_operation_none,
                                        op_b,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A need to transpose.
            rocblas_operation op_a = (transa == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K,
                                        N,
                                        M,
                                        &alpha,
                                        &beta,
                                        A,
                                        C,
                                        B,
                                        op_a,
                                        rocblas_operation_none,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();

            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};
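A short usage sketch for the tuning helpers above (shapes and loop count are illustrative assumptions):

rocblas_handle handle;
rocblas_create_handle(&handle);
GemmTest<float> test(/*m=*/1024, /*n=*/1024, /*k=*/1024,
                     rocblas_operation_transpose, rocblas_operation_none, handle);
std::array<int, 3> best = test.TestAlgo(/*loops=*/50);  // fastest algo for the forward, dA, and dB GEMMs
rocblas_destroy_handle(handle);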
csrc/includes/general_kernels_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define THREADS 256
#define TILE_DIM 32
#define minus_infinity -1 * std::numeric_limits<float>::infinity()
#define FINAL_MASK 0xffffffff
template <typename T>
void launch_fused_add2(T* out,
                       const T* inp1,
                       const T* inp2,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

template <typename T>
void launch_fused_add4(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       const T* inp4,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

template <typename T>
void launch_fused_add3(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);
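For illustration, a hedged host-side call of the first declaration (pointer names are placeholders; the kernel bodies live in the corresponding .hip source, not in this header):

// Fused out = inp1 + inp2 over a [batch, seq, hidden] activation tensor on the given stream.
launch_fused_add2<float>(d_out, d_attn_out, d_residual, batch_size, seq_length, hidden_size, stream);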
csrc/includes/normalize_layer_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_hip_layers.h"
using namespace std;

template <typename T>
class Normalize_Layer {
public:
    struct Config {
        uint32_t batchSize;
        uint32_t seqLength;
        uint32_t hiddenDim;
        float epsilon;
        bool training;
        bool useMean;
        Config(uint32_t batch,
               uint32_t seq,
               uint32_t h,
               float epsilon = 1e-12,
               bool training = true,
               bool useMean = true)
            : batchSize(batch),
              seqLength(seq),
              hiddenDim(h),
              epsilon(epsilon),
              training(training),
              useMean(useMean)
        {
        }
    };

    Normalize_Layer(Config config)
        : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
    {
    }

    ~Normalize_Layer() {}

    void ForwardCheckpoint(int bsz,  // batch * seq
                           T* vals,
                           const T* residual,
                           const T* gamma,
                           const T* betta,
                           hipStream_t& stream,
                           bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars,
                                        means);
    }

    void Forward(int bsz,
                 T* vals,
                 const T* residual,
                 const T* gamma,
                 const T* betta,
                 hipStream_t& stream,
                 bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars);
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_in = nullptr)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_in,
                                  vars,
                                  means,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream);
    }

    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  const T* betta,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_out)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_out,
                                  vars,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream,
                                  !config_.useMean,
                                  betta);
    }

    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_in = nullptr)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_in,
                                            vars,
                                            means,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream);
    }

    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          const T* betta,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_out)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_out,
                                            vars,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream,
                                            !config_.useMean,
                                            betta);
    }

    inline bool UseMean() const { return config_.useMean; }

    inline void SetVar(T* variance)
    {
        if (!variance) { throw std::runtime_error("Normalize variance is null."); }
        vars = variance;
    }

    inline void SetMean(T* mean)
    {
        if (!mean) { throw std::runtime_error("Normalize mean is null."); }
        means = mean;
    }

private:
    Config config_;
    T* vars;
    T* means;
    T* vals_hat;
};
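A brief, hedged sketch of the layer-norm wrapper above (the variance/mean buffers are assumed to be allocated by the surrounding transformer layer):

Normalize_Layer<float> ln(
    Normalize_Layer<float>::Config(batch, seq, hidden, 1e-12f, /*training=*/true, /*useMean=*/true));
ln.SetVar(d_var);
ln.SetMean(d_mean);
ln.ForwardCheckpoint(batch * seq, d_vals, d_residual, d_gamma, d_beta, stream);  // stores mean/var for the backward pass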
csrc/includes/quantizer_hip.h
deleted
100644 → 0
View file @
7dd68788
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <cooperative_groups.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>