OpenDAS / FastMoE · Commits · 1f82fb16

Unverified commit 1f82fb16, authored Sep 11, 2023 by Rick Ho, committed by GitHub on Sep 11, 2023.

Merge pull request #173 from laekov/fit-new-smgr

Fit old code with new smgr

Parents: 2bd187cb, 945004e7

Showing 11 changed files with 38 additions and 32 deletions (+38 -32):
cuda/balancing.cu                    +6  -2
cuda/balancing.cuh                   +2  -4
cuda/fastermoe/smart_schedule.cpp    +1  -2
cuda/fastermoe/smart_schedule.h      +8  -10
cuda/global_exchange.cpp             +2  -3
cuda/global_exchange.h               +4  -6
cuda/local_exchange.cuh              +2  -4
cuda/parallel_linear.cuh             +2  -0
cuda/stream_manager.cpp              +8  -0
cuda/stream_manager.h                +2  -0
fmoe/fastermoe/schedule.py           +1  -1
cuda/balancing.cu

@@ -104,6 +104,7 @@ std::vector<torch::Tensor> _swipe_once(
     }
     long *d_lec = _h2d(lec, n_worker),
          *d_gec = _cudamalloc<long>(n_worker);
     fmoe_cuda_expert_exchange_impl(d_lec, d_gec, 1, n_worker, smgr);
+    smgr->syncTorch();
     long *gec = _d2h(d_gec, n_worker);

     /* Limit number of incoming samples */
@@ -123,17 +124,20 @@ std::vector<torch::Tensor> _swipe_once(
     /* Send limit information back */
     _h2d(gec, d_gec, n_worker);
     fmoe_cuda_expert_exchange_impl(d_gec, d_lec, 1, n_worker, smgr);
+    smgr->syncTorch();
     _d2h(d_lec, lec, n_worker);

     auto d_dropcount = _h2d(drop_count, n_worker);
     ncclAllReduce(d_dropcount, d_dropcount, n_worker, ncclInt64, ncclSum,
-            smgr->ncclcomm, smgr->stream());
+            smgr->ncclcomm, smgr->torchStream());
+    smgr->syncTorch();
     _d2h(d_dropcount, drop_count, n_worker);

     auto d_gcap = _cudamalloc<long>(n_worker);
     _h2d(&cap, d_gcap + rank, 1);
     ncclAllGather(d_gcap + rank, d_gcap, 1, ncclInt64,
-            smgr->ncclcomm, smgr->stream());
+            smgr->ncclcomm, smgr->torchStream());
+    smgr->syncTorch();
     auto gcap = _d2h(d_gcap, n_worker);

     /* Re-assign and update counters */
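The pattern above repeats three times: a collective is enqueued, `smgr->syncTorch()` drains the stream, and only then is the result copied back to the host with `_d2h`. Below is a minimal sketch (not FastMoE code; buffer and parameter names are illustrative) of why the sync is required: NCCL collectives are asynchronous with respect to the host, so reading the device buffer before the stream drains would race with the reduction.

    #include <cuda_runtime.h>
    #include <nccl.h>

    void reduce_then_read(long* d_buf, long* h_buf, int n,
                          ncclComm_t comm, cudaStream_t torch_stream) {
        // Enqueue the collective on the stream PyTorch is currently using.
        // In-place (sendbuff == recvbuff) is allowed by NCCL.
        ncclAllReduce(d_buf, d_buf, n, ncclInt64, ncclSum, comm, torch_stream);
        // Equivalent of smgr->syncTorch(): block until the stream is empty.
        cudaStreamSynchronize(torch_stream);
        // The device buffer is now final and safe to copy to the host.
        cudaMemcpy(h_buf, d_buf, n * sizeof(long), cudaMemcpyDeviceToHost);
    }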
cuda/balancing.cuh

@@ -25,9 +25,8 @@ void fmoe_cuda_limit_by_capacity_impl(const long* ec, int* cap,
         CudaStreamManager* smgr) {
     dim3 grid_dim(CEIL(n_worker, 1024), n_expert);
     dim3 block_dim(1024);
-    limit_by_capacity_kernel<<<grid_dim, block_dim, 0, smgr->stream(0)>>>(
+    limit_by_capacity_kernel<<<grid_dim, block_dim, 0, smgr->torchStream()>>>(
             ec, cap, eca, n_expert, n_worker);
-    smgr->sync(1);
 }

 __global__
@@ -51,8 +50,7 @@ void fmoe_cuda_prune_gate_by_capacity_impl(long* gate_idx, long* new_gate_idx,
         CudaStreamManager* smgr) {
     dim3 grid_dim(CEIL(batch_size, 1024));
     dim3 block_dim(1024);
-    prune_gate_by_capacity_kernel<<<grid_dim, block_dim, 0, smgr->stream(0)>>>(
+    prune_gate_by_capacity_kernel<<<grid_dim, block_dim, 0, smgr->torchStream()>>>(
             gate_idx, new_gate_idx, ec, batch_size, n_expert, n_worker);
-    smgr->sync(1);
 }
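Both launches now target the stream PyTorch is currently using instead of the manager's side stream, which is why the trailing `smgr->sync(1)` calls can be dropped: later torch ops on the same stream are automatically ordered after the kernel. A minimal sketch of the idiom (the kernel and function names here are illustrative, not FastMoE's):

    #include <c10/cuda/CUDAStream.h>

    __global__ void my_kernel(const long* in, long* out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = in[i] + 1;  // placeholder body
    }

    void launch_on_torch_stream(const long* in, long* out, int n) {
        // Same call that CudaStreamManager::torchStream() wraps in this commit.
        cudaStream_t s = c10::cuda::getCurrentCUDAStream().stream();
        my_kernel<<<(n + 255) / 256, 256, 0, s>>>(in, out, n);
        // No explicit sync needed: torch work enqueued later on this stream
        // is serialized behind the kernel.
    }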
cuda/fastermoe/smart_schedule.cpp

@@ -44,10 +44,9 @@ void _reduce_grad(
         long expert_size) {
     auto smgr = getCudaStreamManager(t.device().index());
-    auto torch_stream = c10::cuda::getCurrentCUDAStream().stream();

     cudaEvent_t evt_stash;
     cudaEventCreate(&evt_stash);
-    cudaEventRecord(evt_stash, torch_stream);
+    cudaEventRecord(evt_stash, smgr->torchStream());
     FMOE_SWE(smgr->stream(0), evt_stash);
     cudaEventDestroy(evt_stash);
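The event is now recorded on the torch stream fetched through the manager rather than through a locally cached `torch_stream`, and `FMOE_SWE` (by its usage here, a stream-wait-event wrapper) makes the worker stream wait on it. A minimal sketch of that cross-stream ordering pattern in plain CUDA, with illustrative names:

    #include <cuda_runtime.h>

    void order_comm_after_torch(cudaStream_t torch_stream,
                                cudaStream_t worker_stream) {
        cudaEvent_t evt;
        cudaEventCreate(&evt);
        cudaEventRecord(evt, torch_stream);         // mark pending torch work
        cudaStreamWaitEvent(worker_stream, evt, 0); // worker waits for it
        cudaEventDestroy(evt);                      // safe: wait already enqueued
    }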
cuda/fastermoe/smart_schedule.h

@@ -122,8 +122,7 @@ void fmoe_cuda_fused_forward_impl(
         long d_model,
         long num_expert, long rank, long world_size, long expert_size,
         long pipeline_gran, CudaStreamManager* smgr) {
-    auto torch_stream = c10::cuda::getCurrentCUDAStream().stream();
-    cudaStreamSynchronize(torch_stream);
+    smgr->syncTorch();

     int *local_ptr = new int[num_expert * world_size + 1];
     int *global_ptr = new int[num_expert * world_size + 1];
@@ -192,7 +191,7 @@ void fmoe_cuda_fused_forward_impl(
     // C_0 ... C_n
     for (long step = 0; step < n_groups; ++step) {
         FMOE_SWE(smgr->stream(0), input_ready[step]);
-        FMOE_SWE(torch_stream, input_ready[step]);
+        FMOE_SWE(smgr->torchStream(), input_ready[step]);
         for (int ei = 0; ei < num_expert; ++ei) {
             GEN_BASE(step);
             long offset = global_ptr[ei * world_size + from_base];
@@ -203,14 +202,14 @@ void fmoe_cuda_fused_forward_impl(
                     (long) ei,
                     step * num_expert + ei, offset, micro_batch_size, d_model, smgr);
         }
         cudaEventRecord(output_ready[step], smgr->stream(0));
-        cudaEventRecord(output_torch_ready[step], torch_stream);
+        cudaEventRecord(output_torch_ready[step], smgr->torchStream());
     }

     // Compute over shadowed experts
     for (long i = 0, si = 0; i < world_size * num_expert; ++i) {
         if (stored_models[i]) {
             FMOE_SWE(smgr->stream(0), evt_shadow[si]);
-            FMOE_SWE(torch_stream, evt_shadow[si]);
+            FMOE_SWE(smgr->torchStream(), evt_shadow[si]);
             stash_fn(params[si], si, 0);
             // always put shadowed expert at first, so expert_idx = 0
             long offset = local_ptr[i];
             long micro_batch_size = local_expert_count[i];
@@ -282,8 +281,7 @@ void fmoe_cuda_fused_backward_impl(
         long d_model,
         long num_expert, long rank, long world_size,
         long pipeline_gran, CudaStreamManager* smgr) {
-    auto torch_stream = c10::cuda::getCurrentCUDAStream().stream();
-    cudaStreamSynchronize(torch_stream);
+    smgr->syncTorch();

     int *local_ptr = new int[num_expert * world_size + 1];
     int *global_ptr = new int[num_expert * world_size + 1];
@@ -350,7 +348,7 @@ void fmoe_cuda_fused_backward_impl(
     // C_0 ... C_n
     for (long step = 0; step < n_groups; ++step) {
         FMOE_SWE(smgr->stream(0), input_ready[step]);
-        FMOE_SWE(torch_stream, input_ready[step]);
+        FMOE_SWE(smgr->torchStream(), input_ready[step]);
         for (int ei = 0; ei < num_expert; ++ei) {
             GEN_BASE(step);
             long offset = global_ptr[ei * world_size + from_base];
@@ -362,7 +360,7 @@ void fmoe_cuda_fused_backward_impl(
                     (long) ei,
                     step * num_expert + ei, offset, micro_batch_size, d_model, smgr);
         }
         cudaEventRecord(output_ready[step], smgr->stream(0));
-        cudaEventRecord(output_torch_ready[step], torch_stream);
+        cudaEventRecord(output_torch_ready[step], smgr->torchStream());
     }

     // Collect gradients for shadowed experts
@@ -370,7 +368,7 @@ void fmoe_cuda_fused_backward_impl(
         if (stored_models[i]) {
             if (i / num_expert == rank) {
                 FMOE_SWE(smgr->stream(0), evt_reduce[i % num_expert]);
-                FMOE_SWE(torch_stream, evt_reduce[i % num_expert]);
+                FMOE_SWE(smgr->torchStream(), evt_reduce[i % num_expert]);
                 set_grad_fn(si, i % num_expert);
             }
             ++si;
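Throughout both fused implementations the cached `torch_stream` variable is replaced by `smgr->torchStream()`, so the current PyTorch stream is re-queried at each use. The surrounding pipelining idiom is sketched minimally below with hypothetical helper names; the real code additionally launches NCCL transfers and expert computation inside each step.

    #include <cuda_runtime.h>

    void pipeline_steps(int n_groups,
                        cudaStream_t worker, cudaStream_t torch_stream,
                        cudaEvent_t* input_ready,
                        cudaEvent_t* output_ready,
                        cudaEvent_t* output_torch_ready) {
        for (int step = 0; step < n_groups; ++step) {
            // Neither consumer may start before this step's input arrives.
            cudaStreamWaitEvent(worker, input_ready[step], 0);
            cudaStreamWaitEvent(torch_stream, input_ready[step], 0);

            // ... enqueue this step's expert computation on `worker` ...

            // Mark the step's outputs ready on both streams so downstream
            // stages (and later torch ops) can be ordered against them.
            cudaEventRecord(output_ready[step], worker);
            cudaEventRecord(output_torch_ready[step], torch_stream);
        }
    }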
cuda/global_exchange.cpp

@@ -19,17 +19,16 @@ void fmoe_cuda_expert_exchange_impl(
                 ncclInt64,
                 i,
                 smgr->ncclcomm,
-                smgr->stream(0)));
+                smgr->torchStream()));
         NCCL_SAFE_CALL(ncclRecv(
                 global_expert_count + n_expert * i,
                 n_expert,
                 ncclInt64,
                 i,
                 smgr->ncclcomm,
-                smgr->stream(0)));
+                smgr->torchStream()));
     }
     NCCL_SAFE_CALL(ncclGroupEnd());
-    smgr->sync(1);
 }

 torch::Tensor _expert_exchange(
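The expert-count exchange now runs on the torch stream and drops its trailing `smgr->sync(1)`. A minimal sketch of the grouped send/recv idiom it uses (error checking via `NCCL_SAFE_CALL` omitted; names are illustrative): pairing every `ncclSend` with the matching `ncclRecv` inside one group lets the ranks' calls match up without deadlocking.

    #include <nccl.h>

    void exchange_counts(const long* local, long* global, int n_expert,
                         int world_size, ncclComm_t comm,
                         cudaStream_t torch_stream) {
        ncclGroupStart();
        for (int i = 0; i < world_size; ++i) {
            // Send our per-expert counts to rank i, receive theirs back.
            ncclSend(local + n_expert * i, n_expert, ncclInt64, i,
                     comm, torch_stream);
            ncclRecv(global + n_expert * i, n_expert, ncclInt64, i,
                     comm, torch_stream);
        }
        ncclGroupEnd();  // all sends/recvs are issued together here
    }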
cuda/global_exchange.h

@@ -36,7 +36,7 @@ void fmoe_cuda_global_scatter_impl(
                 ncclChar,
                 j,
                 smgr->ncclcomm,
-                smgr->stream(0)));
+                smgr->torchStream()));
         }
         if (global_expert_count[idx]) {
             NCCL_SAFE_CALL(ncclRecv(
@@ -45,14 +45,13 @@ void fmoe_cuda_global_scatter_impl(
                 ncclChar,
                 j,
                 smgr->ncclcomm,
-                smgr->stream(0)));
+                smgr->torchStream()));
             recv_ptr += global_expert_count[idx];
         }
     }
     NCCL_SAFE_CALL(ncclGroupEnd());
 }
 delete [] expert_ptr;
-    smgr->sync(1);
 }

 template<typename scalar_t>
@@ -82,7 +81,7 @@ void fmoe_cuda_global_gather_impl(
                 ncclChar,
                 j,
                 smgr->ncclcomm,
-                smgr->stream(0)));
+                smgr->torchStream()));
             send_ptr += global_expert_count[idx];
         }
         if (local_expert_count[idx]) {
@@ -92,13 +91,12 @@ void fmoe_cuda_global_gather_impl(
                 ncclChar,
                 j,
                 smgr->ncclcomm,
-                smgr->stream(0)));
+                smgr->torchStream()));
         }
     }
     NCCL_SAFE_CALL(ncclGroupEnd());
 }
 delete [] expert_ptr;
-    smgr->sync(1);
 }
cuda/local_exchange.cuh

@@ -21,9 +21,8 @@ void fmoe_cuda_assign_pos_impl(
         CudaStreamManager* smgr) {
     size_t numel = batch_size * topk;
     assign_pos_kernel
-        <<<CEIL(numel, 256), 256, 0, smgr->stream(0)>>>
+        <<<CEIL(numel, 256), 256, 0, smgr->torchStream()>>>
         (cum_count, gate, pos, numel, topk);
-    smgr->sync(1);
 }

 #define PERTHREAD_EXPERTS 256
@@ -74,7 +73,6 @@ void fmoe_cuda_expert_count_impl(
         const size_t batch_size, const size_t n_expert,
         CudaStreamManager* smgr) {
     expert_count_kernel
-        <<<CEIL(n_expert, PERTHREAD_EXPERTS), 256, 0, smgr->stream(0)>>>
+        <<<CEIL(n_expert, PERTHREAD_EXPERTS), 256, 0, smgr->torchStream()>>>
         (gate_idx, expert_count, batch_size, n_expert);
-    smgr->sync(1);
 }
cuda/parallel_linear.cuh

@@ -65,6 +65,7 @@ void fmoe_cuda_linear_forward_impl(
         CudaStreamManager* smgr) {
     scalar_t alpha = 1, beta = has_bias ? 1 : 0;
+    smgr->syncTorch();

     for (int i = 0, ptr = 0; i < num_expert; ++i) {
         if (expert_count[i] == 0) {
             continue;
@@ -102,6 +103,7 @@ void fmoe_cuda_linear_backward_impl(
         const size_t out_feat,
         const size_t num_expert,
         CudaStreamManager* smgr) {
+    smgr->syncTorch();
     scalar_t alpha = 1, beta = 0;

     // bias
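Both linear implementations run their GEMMs through the manager's own cuBLAS handles and streams, so each now drains the torch stream first to ensure the input tensors are fully materialized before the side-stream GEMMs read them. A minimal sketch of that prologue, assuming a single hypothetical GEMM on a side stream (names and shapes are illustrative, not FastMoE's):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    void gemm_after_torch(cublasHandle_t handle,
                          cudaStream_t side_stream, cudaStream_t torch_stream,
                          int m, int n, int k,
                          const float* A, const float* B, float* C) {
        float alpha = 1.f, beta = 0.f;
        cudaStreamSynchronize(torch_stream);  // smgr->syncTorch() equivalent
        cublasSetStream(handle, side_stream); // GEMM runs on the side stream
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                    m, n, k, &alpha, A, m, B, k, &beta, C, m);
    }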
cuda/stream_manager.cpp

@@ -19,6 +19,10 @@ cudaStream_t CudaStreamManager::stream(size_t idx) {
     return this->streams[idx % SMGR_N_STREAMS];
 }

+cudaStream_t CudaStreamManager::torchStream() {
+    return c10::cuda::getCurrentCUDAStream().stream();
+}
+
 cublasHandle_t CudaStreamManager::handle(size_t idx) {
     if (this->use_default) {
         return at::cuda::getCurrentCUDABlasHandle();
@@ -27,6 +31,10 @@ cublasHandle_t CudaStreamManager::handle(size_t idx) {
 }

+void CudaStreamManager::syncTorch() {
+    cudaStreamSynchronize(this->torchStream());
+}
+
 void CudaStreamManager::sync(int idx) {
     if (this->use_default) {
         return;
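These two methods are the heart of the commit: `torchStream()` exposes PyTorch's current stream through the manager, and `syncTorch()` blocks the host until that stream drains. A minimal usage sketch (assuming `stream_manager.h` is included; the caller is hypothetical):

    #include "stream_manager.h"

    void example(CudaStreamManager* smgr) {
        cudaStream_t s = smgr->torchStream();  // PyTorch's current stream
        // ... enqueue kernels or NCCL calls on s ...
        smgr->syncTorch();                     // wait until that stream drains
    }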
cuda/stream_manager.h

@@ -34,8 +34,10 @@ public:
     void setup(int);
     void sync(int=0);
+    void syncTorch();
     void destroy();

+    cudaStream_t torchStream();
     cudaStream_t stream(size_t=0);
     cublasHandle_t handle(size_t=0);
fmoe/fastermoe/schedule.py

@@ -37,7 +37,7 @@ class MoEForward(Function):
             try:
                 # To skip torch autograd's version check.
                 with torch.autograd.graph.saved_tensors_hooks(nothing, nothing):
-                    y0 = expert_fn(x, torch.tensor([x.shape[0]], dtype=torch.int64))
+                    y0 = expert_fn(x, torch.tensor([x.shape[0]], dtype=torch.int64), expert_idx)
             except Exception as e:
                 # Ignore the error and fall back for compatibility to older
                 # versions of PyTorch