OpenDAS / ColossalAI / Commits / 8823cc48

Unverified commit 8823cc48, authored Jan 29, 2024 by Frank Lee; committed by GitHub on Jan 29, 2024.

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu

Parents: bce9499e, 73f4dc57

Changes: 266
Showing 20 changed files with 108 additions and 100 deletions (+108 −100):

- colossalai/utils/timer.py (+4 −4)
- colossalai/zero/gemini/chunk/chunk.py (+14 −14)
- colossalai/zero/gemini/chunk/manager.py (+8 −4)
- colossalai/zero/gemini/gemini_ddp.py (+4 −3)
- colossalai/zero/gemini/gemini_optimizer.py (+8 −7)
- colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py (+2 −2)
- colossalai/zero/gemini/memory_tracer/memory_monitor.py (+3 −3)
- colossalai/zero/gemini/placement_policy.py (+5 −5)
- colossalai/zero/gemini/utils.py (+3 −3)
- colossalai/zero/low_level/low_level_optim.py (+16 −17)
- docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md (+2 −3)
- docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md (+1 −2)
- examples/community/roberta/pretraining/run_pretraining.py (+7 −4)
- examples/images/dreambooth/train_dreambooth_colossalai.py (+6 −6)
- examples/images/dreambooth/train_dreambooth_colossalai_lora.py (+6 −6)
- examples/images/resnet/train.py (+3 −3)
- examples/images/vit/vit_benchmark.py (+3 −2)
- examples/inference/benchmark_llama.py (+5 −6)
- examples/inference/run_llama_inference.py (+2 −2)
- examples/language/bert/finetune.py (+6 −4)
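Every change on this page is the same mechanical substitution: device queries that previously went through `colossalai.utils.get_current_device` or the `colossalai.utils.device` module now go through the accelerator abstraction. A minimal before/after sketch of the pattern, assuming a ColossalAI build that ships `colossalai.accelerator`; the tensor and variable names here are illustrative, not from the diff:

```python
import torch

from colossalai.accelerator import get_accelerator

# Old style, removed in this commit:
#   from colossalai.utils import get_current_device
#   device = get_current_device()

# New style: one handle that covers CUDA and NPU backends alike.
accelerator = get_accelerator()
device = accelerator.get_current_device()

# Allocation and synchronization no longer name "cuda" directly.
buf = torch.zeros(1024, dtype=torch.float, device=device)
accelerator.synchronize()  # replaces the old module-level synchronize()
print(accelerator.name)    # "cuda" or "npu", replacing IS_NPU_AVAILABLE checks
```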
colossalai/utils/timer.py

```diff
@@ -3,7 +3,7 @@
 import time
 from typing import Tuple
 
-from .device import synchronize
+from colossalai.accelerator import get_accelerator
 
 
 class Timer:
@@ -21,13 +21,13 @@ class Timer:
     @property
     def current_time(self) -> float:
-        synchronize()
+        get_accelerator().synchronize()
         return time.time()
 
     def start(self):
         """Firstly synchronize cuda, reset the clock and then start the timer."""
         self._elapsed = 0
-        synchronize()
+        get_accelerator().synchronize()
         self._start_time = time.time()
         self._started = True
@@ -44,7 +44,7 @@ class Timer:
         Returns:
             int: Start-stop interval.
         """
-        synchronize()
+        get_accelerator().synchronize()
         end_time = time.time()
         elapsed = end_time - self._start_time
         if keep_in_history:
```
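Each `synchronize()` in `Timer` exists because device kernels launch asynchronously: without draining the queue first, `time.time()` would mostly measure launch overhead rather than execution time. A hedged sketch of the same idea as a standalone helper (`timed` is illustrative, not a library function; only `get_accelerator().synchronize()` comes from the diff):

```python
import time

from colossalai.accelerator import get_accelerator


def timed(fn, *args):
    # Drain pending kernels so the clock starts at a known point,
    # mirroring Timer.start() above.
    get_accelerator().synchronize()
    start = time.time()
    out = fn(*args)
    # Wait for fn's kernels to finish before stopping the clock,
    # mirroring Timer.stop().
    get_accelerator().synchronize()
    return out, time.time() - start
```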
colossalai/zero/gemini/chunk/chunk.py

```diff
@@ -6,8 +6,7 @@ import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
-from colossalai.utils import get_current_device
-from colossalai.utils.device import IS_NPU_AVAILABLE
+from colossalai.accelerator import get_accelerator
 
 
 class TensorState(Enum):
@@ -107,7 +106,7 @@ class Chunk:
         self.valid_end = self.shard_size
 
         self.dtype = dtype
-        device = init_device or get_current_device()
+        device = init_device or get_accelerator().get_current_device()
         # chunk_temp is a global chunk, which only exists during building the chunks.
         self.chunk_temp = torch.zeros(chunk_size, dtype=dtype, device=device)  # keep all zero
@@ -125,7 +124,7 @@ class Chunk:
         # configure the init device of the shard
         # no-offload default: fp16, fp32 -> CUDA
         # offload default: fp16, fp32 -> CPU
-        self.shard_device = torch.device("cpu") if cpu_shard_init else get_current_device()
+        self.shard_device = torch.device("cpu") if cpu_shard_init else get_accelerator().get_current_device()
 
         self.chunk_mem = self.chunk_size * self.chunk_temp.element_size()
         self.shard_mem = self.chunk_mem // self.pg_size
@@ -191,9 +190,8 @@ class Chunk:
     def device_type(self) -> str:
         if self.chunk_temp is not None:
             return self.chunk_temp.device.type
-        else:
-            if self.is_gathered or self.cuda_shard is not None:
-                return "npu" if IS_NPU_AVAILABLE else "cuda"
+        elif self.is_gathered or self.cuda_shard is not None:
+            return get_accelerator().name
         else:
             return "cpu"
@@ -297,7 +295,7 @@ class Chunk:
         self.valid_end = self.utilized_size - self.shard_begin
 
         if self.chunk_temp.device.type == "cpu":
-            self.cuda_global_chunk = self.chunk_temp.to(get_current_device())
+            self.cuda_global_chunk = self.chunk_temp.to(get_accelerator().get_current_device())
             self.__update_tensors_ptr()
         else:
             self.cuda_global_chunk = self.chunk_temp
@@ -334,12 +332,12 @@ class Chunk:
             return
 
         if device.type == "cuda" or device.type == "npu":
-            assert device == get_current_device(), "can't move chunk to another device"
+            assert device == get_accelerator().get_current_device(), "can't move chunk to another device"
 
             if self.cuda_shard:
                 return
 
-            self.cuda_shard = self.cpu_shard.to(get_current_device())
+            self.cuda_shard = self.cpu_shard.to(get_accelerator().get_current_device())
 
             if not self.pin_memory:
                 self.cpu_shard = None
@@ -394,7 +392,9 @@ class Chunk:
             if self.extra_dp_group is not None:
                 dist.all_reduce(self.cuda_global_chunk, group=self.extra_dp_group)
         else:
-            self.cuda_shard = torch.empty(self.shard_size, dtype=self.dtype, device=get_current_device())
+            self.cuda_shard = torch.empty(
+                self.shard_size, dtype=self.dtype, device=get_accelerator().get_current_device()
+            )
 
             input_list = list(torch.chunk(self.cuda_global_chunk, chunks=self.pg_size, dim=0))
             dist.reduce_scatter(self.cuda_shard, input_list, group=self.torch_pg)
@@ -533,7 +533,7 @@ class Chunk:
         # only be called when optimizer state is in CPU memory
         # the grad and param should be in the same device
         assert self.cuda_shard is None
-        temp = optim_chunk.cpu_shard.to(get_current_device())
+        temp = optim_chunk.cpu_shard.to(get_accelerator().get_current_device())
         # avoid to transform FP32 in CPU
         self.cuda_shard = temp.to(self.dtype)
@@ -631,7 +631,7 @@ class Chunk:
         grad_chunk.valid_end = self.valid_end
 
         if grad_chunk.chunk_temp.device.type == "cpu":
-            grad_chunk.cuda_global_chunk = grad_chunk.chunk_temp.to(get_current_device())
+            grad_chunk.cuda_global_chunk = grad_chunk.chunk_temp.to(get_accelerator().get_current_device())
         else:
             grad_chunk.cuda_global_chunk = grad_chunk.chunk_temp
         grad_chunk.chunk_temp = None
```
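The `device_type` change is the one place in this file where the rewrite is more than a renamed call: the hard-coded `"npu" if IS_NPU_AVAILABLE else "cuda"` branch collapses into `get_accelerator().name`, so supporting a new backend no longer requires editing `Chunk`. A short, hedged sketch of how calling code can stay backend-agnostic (`memory_bucket` is illustrative; `chunk` stands in for the `Chunk` objects above):

```python
from colossalai.accelerator import get_accelerator


def memory_bucket(chunk) -> str:
    # Chunks living on the host are tracked under "cpu"; everything else
    # is tracked under the active backend's name ("cuda", "npu", ...).
    if chunk.device_type == "cpu":
        return "cpu"
    return get_accelerator().name
```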
colossalai/zero/gemini/chunk/manager.py

```diff
@@ -5,7 +5,8 @@ import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
-from colossalai.utils import free_storage, get_current_device
+from colossalai.accelerator import get_accelerator
+from colossalai.utils import free_storage
 
 from .chunk import Chunk, ChunkFullError, TensorState
@@ -20,7 +21,7 @@ class ChunkManager:
     """
 
     def __init__(self, chunk_configuration, init_device: Optional[torch.device] = None) -> None:
-        self.device = init_device or get_current_device()
+        self.device = init_device or get_accelerator().get_current_device()
         self.dp_degree_chunk_size_dict: Dict[int, int] = dict()
         self.kwargs_config = chunk_configuration
         for k, v in self.kwargs_config.items():
@@ -107,7 +108,7 @@ class ChunkManager:
             return
         self.__sub_memory_usage(chunk.memory_usage)
         if chunk.device_type == "cpu":
-            chunk.shard_move(get_current_device())
+            chunk.shard_move(get_accelerator().get_current_device())
         self.__add_accessed_chunk(chunk)
         self.__add_memory_usage(chunk.memory_usage)
@@ -276,7 +277,10 @@
                 accumulated_grad = chunk.grad_chunk.cuda_shard.clone().detach().mul_(chunk.pg_size)
             else:
-                accumulated_grad = (
-                    chunk.grad_chunk.cpu_shard.to(get_current_device()).clone().detach().mul_(chunk.pg_size)
-                )
+                accumulated_grad = (
+                    chunk.grad_chunk.cpu_shard.to(get_accelerator().get_current_device())
+                    .clone()
+                    .detach()
+                    .mul_(chunk.pg_size)
+                )
             accumulated_grad_gathered = False
```
colossalai/zero/gemini/gemini_ddp.py

```diff
@@ -10,6 +10,7 @@ import torch.nn as nn
 from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import _get_default_group
 
+from colossalai.accelerator import get_accelerator
 from colossalai.checkpoint_io.utils import StateDictSharder, gather_distributed_param
 from colossalai.interface import ModelWrapper
 from colossalai.lazy import LazyTensor
@@ -27,7 +28,7 @@ from colossalai.tensor.d_tensor import (
     is_distributed_tensor,
 )
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import _cast_float, free_storage, get_current_device, is_ddp_ignored
+from colossalai.utils import _cast_float, free_storage, is_ddp_ignored
 
 from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager
 from .gemini_hook import GeminiZeROHook
@@ -766,7 +767,7 @@ class GeminiDDP(ModelWrapper):
             # move ignored parameters to CUDA
             if is_ddp_ignored(p):
-                p.data = p.data.to(device=get_current_device(), dtype=self.mixed_precision)
+                p.data = p.data.to(device=get_accelerator().get_current_device(), dtype=self.mixed_precision)
                 continue
 
             # create a fp16 parameter
@@ -815,7 +816,7 @@ class GeminiDDP(ModelWrapper):
         for buffer in self.module.buffers():
             if isinstance(buffer, LazyTensor):
                 buffer.materialize()
-            buffer.data = buffer.to(get_current_device())
+            buffer.data = buffer.to(get_accelerator().get_current_device())
             if torch.is_floating_point(buffer):
                 buffer.data = buffer.to(self.mixed_precision)
```
colossalai/zero/gemini/gemini_optimizer.py

```diff
@@ -11,6 +11,7 @@ from torch.distributed import ProcessGroup
 from torch.nn import Parameter
 from torch.optim import Optimizer
 
+from colossalai.accelerator import get_accelerator
 from colossalai.amp.naive_amp.mixed_precision_mixin import BF16MixedPrecisionMixin, FP16MixedPrecisionMixin
 from colossalai.checkpoint_io.utils import StateDictSharder, gather_distributed_param
 from colossalai.interface import OptimizerWrapper
@@ -26,7 +27,7 @@ from colossalai.tensor.d_tensor import (
     is_customized_distributed_tensor,
     is_distributed_tensor,
 )
-from colossalai.utils import disposable, get_current_device, is_ddp_ignored
+from colossalai.utils import disposable, is_ddp_ignored
 
 from .chunk import Chunk, ChunkManager
 from .gemini_ddp import GeminiDDP
@@ -233,7 +234,7 @@ class GeminiOptimizer(OptimizerWrapper):
             grad_chunk.l2_norm = None  # clear l2 norm
 
-        comm_buffer = torch.zeros(1, dtype=torch.float, device=get_current_device())
+        comm_buffer = torch.zeros(1, dtype=torch.float, device=get_accelerator().get_current_device())
         for group, part_norm in group_to_norm.items():
             comm_buffer.fill_(part_norm)
             dist.all_reduce(comm_buffer, group=group)
@@ -314,10 +315,10 @@ class GeminiOptimizer(OptimizerWrapper):
                     continue
 
                 if fp32_params_used_cuda_margin_mem + chunk32.payload_mem < fp32_params_available_cuda_margin_mem:
-                    self.chunk_manager.move_chunk(chunk32, get_current_device())
+                    self.chunk_manager.move_chunk(chunk32, get_accelerator().get_current_device())
                     # stores grad now
-                    self.chunk_manager.move_chunk(chunk16, get_current_device())
-                    self.module.set_chunk_grad_device(chunk16, get_current_device())
+                    self.chunk_manager.move_chunk(chunk16, get_accelerator().get_current_device())
+                    self.module.set_chunk_grad_device(chunk16, get_accelerator().get_current_device())
                     fp32_params_used_cuda_margin_mem += chunk32.payload_mem
 
         for group in self.param_groups:
@@ -328,7 +329,7 @@ class GeminiOptimizer(OptimizerWrapper):
                 state = self.optim.state[fake_param]
                 for k, v in state.items():
                     if isinstance(v, torch.Tensor):
-                        state[k] = v.to(get_current_device())
+                        state[k] = v.to(get_accelerator().get_current_device())
 
     def _register_states_(self):
         for group in self.optim.param_groups:
@@ -551,7 +552,7 @@ class GeminiOptimizer(OptimizerWrapper):
         self,
         param_id: int,
         state_names: list,
-        device: torch.device = get_current_device(),
+        device: torch.device = get_accelerator().get_current_device(),
         dtype: torch.dtype = torch.float32,
     ) -> torch.Tensor:
         """
```
colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py

```diff
 from typing import Optional
 
-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator
 from colossalai.zero.gemini.chunk import ChunkManager
 
 from .memory_stats import MemStats
@@ -33,4 +33,4 @@ class ChunkMemStatsCollector(MemStatsCollector):
     def cuda_margin_mem(self) -> float:
         from colossalai.legacy.utils.memory import colo_device_memory_capacity
 
-        return colo_device_memory_capacity(get_current_device()) - self._memstats.max_overall_cuda
+        return colo_device_memory_capacity(get_accelerator().get_current_device()) - self._memstats.max_overall_cuda
```
colossalai/zero/gemini/memory_tracer/memory_monitor.py

```diff
@@ -5,7 +5,7 @@ from time import sleep, time
 import torch
 
-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator
 
 
 class MemoryMonitor:
@@ -77,7 +77,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
         super().__init__()
         self.keep_measuring = False
 
-        current_device = get_current_device()
+        current_device = get_accelerator().get_current_device()
 
         def _set_cuda_device():
             torch.cuda.set_device(current_device)
@@ -116,7 +116,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
         while self.keep_measuring:
             max_usage = max(
                 max_usage,
-                colo_device_memory_used(get_current_device()),
+                colo_device_memory_used(get_accelerator().get_current_device()),
             )
             sleep(self.interval)
         return max_usage
```
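`AsyncMemoryMonitor` samples device usage from a worker thread while a training step runs; only the device query changes in this commit. A hedged sketch of that sampling loop, with `torch.cuda.memory_allocated()` standing in for the `colo_device_memory_used(...)` call used above (the helper itself is illustrative):

```python
import time

import torch


def sample_peak_memory(stop, interval: float = 0.01) -> int:
    # Poll allocated device memory until stop() returns True and report
    # the peak in bytes. Illustrative stand-in for the loop inside
    # AsyncMemoryMonitor above.
    peak = 0
    while not stop():
        peak = max(peak, torch.cuda.memory_allocated())
        time.sleep(interval)
    return peak
```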
colossalai/zero/gemini/placement_policy.py

```diff
@@ -6,8 +6,8 @@ from typing import Dict, List, Optional, Tuple, Type
 import torch
 
-from colossalai.utils import get_current_device
-from colossalai.utils.memory import colo_device_memory_capacity
+from colossalai.accelerator import get_accelerator
+from colossalai.legacy.utils.memory import colo_device_memory_capacity
 from colossalai.zero.gemini.chunk import Chunk
 
 from .chunk import Chunk, ChunkManager
@@ -85,7 +85,7 @@ class StaticPlacementPolicy(PlacementPolicy):
             # init offload optim settings
             # keep gathered chunks are in CUDA
             if chunk.keep_gathered or offloaded_optim_chunk_mem >= offload_optim_chunk_mem:
-                device = get_current_device()
+                device = get_accelerator().get_current_device()
             else:
                 device = torch.device("cpu")
                 # real offloaded mem is chunk.shard_mem, for simplicity we use chunk mem here
@@ -140,7 +140,7 @@ class AutoPlacementPolicy(PlacementPolicy):
             int: the volume of memory that is evicted
         """
         start = time()
-        cuda_capacity = colo_device_memory_capacity(get_current_device())
+        cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
         used_cuda_model_data = self.chunk_manager.total_mem["cuda"]
         if warmup:
             # We designate a part of CUDA memory for model data in warmup iterations.
@@ -194,7 +194,7 @@ class AutoPlacementPolicy(PlacementPolicy):
             # init offload optim settings
             # keep gathered chunks are in CUDA
             if chunk.keep_gathered:
-                grads_device_map[p] = get_current_device()
+                grads_device_map[p] = get_accelerator().get_current_device()
             else:
                 grads_device_map[p] = torch.device("cpu")
```
colossalai/zero/gemini/utils.py

```diff
@@ -6,7 +6,7 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 
-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator
 
 from .chunk import Chunk
@@ -18,11 +18,11 @@ def get_temp_total_chunk_on_cuda(chunk: Chunk, dtype: torch.dtype):
     if chunk.cuda_shard is not None:
         shard_temp = chunk.cuda_shard
     else:
-        shard_temp = chunk.cpu_shard.to(get_current_device())
+        shard_temp = chunk.cpu_shard.to(get_accelerator().get_current_device())
 
     shard_temp = shard_temp.to(dtype)
 
-    total_temp = torch.zeros(chunk.chunk_size, dtype=dtype, device=get_current_device())
+    total_temp = torch.zeros(chunk.chunk_size, dtype=dtype, device=get_accelerator().get_current_device())
     gather_list = list(torch.chunk(input=total_temp, chunks=chunk.pg_size, dim=0))
     dist.all_gather(tensor_list=gather_list, tensor=shard_temp, group=chunk.torch_pg)
```
colossalai/zero/low_level/low_level_optim.py

```diff
@@ -12,7 +12,7 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from torch.distributed import ProcessGroup
 from torch.optim import Optimizer
 
-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.amp.naive_amp.mixed_precision_mixin import (
     BF16MixedPrecisionMixin,
     FP16MixedPrecisionMixin,
@@ -22,9 +22,6 @@ from colossalai.interface import OptimizerWrapper
 from colossalai.logging import get_dist_logger
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 
-# from colossalai.tensor import ColoParameter, ProcessGroup
-from colossalai.utils.device import IS_NPU_AVAILABLE, get_current_device
-
 from ._utils import calculate_global_norm_from_list, flatten, has_inf_or_nan, release_param_grad, sync_tensor
 from .bookkeeping import BucketStore, GradientStore, ParameterStore
@@ -183,7 +180,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         # initialize communication stream for
         # communication-computation overlapping
         if self._overlap_communication:
-            self._comm_stream = device_utils.Stream()
+            self._comm_stream = get_accelerator().Stream()
 
         # reduction hook is only used if overlapping communication
         # or stage 2 is used
@@ -217,7 +214,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         return len(self._working_param_groups)
 
     def _sanity_checks(self):
-        assert torch.cuda.is_available() or IS_NPU_AVAILABLE, "device is required"
+        assert get_accelerator().name in ["cuda", "npu"], "device is required"
         for param_group in self.optim.param_groups:
             group_params = param_group["params"]
             for param in group_params:
@@ -228,7 +225,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
     def _create_master_param_current_rank(self, param_list):
         # split each param evenly by world size
        params_current_rank = []
-        device = "cpu" if self._cpu_offload else get_current_device()
+        device = "cpu" if self._cpu_offload else get_accelerator().get_current_device()
 
         for param in param_list:
             padding_size = (self._world_size - param.numel() % self._world_size) % self._world_size
@@ -340,11 +337,11 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
                 if len(moe_grad_list) > 0:
                     moe_flat_grads.record_stream(stream)
             # waiting for ops in the default stream finishing
-            stream.wait_stream(device_utils.current_stream())
+            stream.wait_stream(get_accelerator().current_stream())
         else:
-            stream = device_utils.current_stream()
+            stream = get_accelerator().current_stream()
 
-        with device_utils.stream(stream):
+        with get_accelerator().stream(stream):
             group_id = self._bucket_store.current_group_id
 
             if self.moe_extra_dp_pg is None:
@@ -486,7 +483,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         # clear reduced grads
         if self._overlap_communication:
-            device_utils.synchronize()
+            get_accelerator().synchronize()
 
         self.zero_grad()
@@ -505,7 +502,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         # clear reduced grads
         if self._overlap_communication:
-            device_utils.synchronize()
+            get_accelerator().synchronize()
 
         self.zero_grad()
@@ -621,7 +618,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
             release_param_grad(self._master_param_groups_of_current_rank[group_id])
 
         # update working partition updated by the current rank
-        device = get_current_device()
+        device = get_accelerator().get_current_device()
         for group_id in range(self.num_param_groups):
             master_working_param = self.optim.param_groups[group_id]["params"]
             for idx, splited_param in enumerate(master_working_param):
@@ -661,7 +658,9 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         norm_type = float(norm_type)
         if norm_type == inf:
             total_norm = max(grad.data.abs().max() for grad in gradients)
-            total_norm_cuda = torch.tensor([float(total_norm)], device=get_current_device(), dtype=torch.float)
+            total_norm_cuda = torch.tensor(
+                [float(total_norm)], device=get_accelerator().get_current_device(), dtype=torch.float
+            )
             dist.all_reduce(total_norm_cuda, op=torch.distributed.ReduceOp.MAX, group=self.dp_pg)
             total_norm = total_norm_cuda.item()
@@ -673,7 +672,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
             # Sum across all model parallel GPUs.
             total_norm_exponentiated_cuda = torch.tensor(
-                [float(total_norm_exponentiated)], device=get_current_device(), dtype=torch.float
+                [float(total_norm_exponentiated)], device=get_accelerator().get_current_device(), dtype=torch.float
             )
             torch.distributed.all_reduce(
                 total_norm_exponentiated_cuda, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg
@@ -765,7 +764,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
             Dict: the pytorch form state_dict
         """
         zero_state = dict()
-        device = get_current_device()
+        device = get_accelerator().get_current_device()
         for param, state in self.optim.state.items():
             zero_state[param] = copy.deepcopy(state)
             for k, v in state.items():
@@ -827,7 +826,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         ret_block = dict()
         ret_block_size = 0
 
-        device = get_current_device()
+        device = get_accelerator().get_current_device()
         local_states = self.optim.state_dict()["state"]
         for param_idx, states in local_states.items():
             current_block_size = 0
```
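The stream calls in this file (`Stream()`, `current_stream()`, `stream(...)`) move from module-level functions in `colossalai.utils.device` to accelerator methods of the same names. A hedged sketch of the overlap pattern the optimizer relies on; only the accelerator calls are taken from the diff, the helper itself is illustrative:

```python
import torch
import torch.distributed as dist

from colossalai.accelerator import get_accelerator


def reduce_on_side_stream(comm_stream, flat_grads: torch.Tensor, group) -> None:
    # Make the side stream wait for the kernels that produced flat_grads
    # on the default stream, then run the all-reduce there so compute and
    # communication can overlap.
    comm_stream.wait_stream(get_accelerator().current_stream())
    with get_accelerator().stream(comm_stream):
        # Keep the allocation alive until the side stream is done with it.
        flat_grads.record_stream(comm_stream)
        dist.all_reduce(flat_grads, group=group)


# Usage mirrors __init__ above: comm_stream = get_accelerator().Stream()
```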
docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md

````diff
@@ -45,7 +45,6 @@ from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 ```
 
 ## Define Plugin
 Create a `HybridParallelPlugin` object and specify the desired parallelism strategies to be used. In this example, both pipeline parallelism and ZeRO-1 are used simultaneously.
````
docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md

````diff
@@ -43,7 +43,6 @@ from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 ```
 
 ### 定义plugin
 定义一个[`HybridParallelPlugin`](../basics/booster_plugins.md)对象，指定所需要使用的并行策略，在该例子中，同时使用了流水线并行和zero1.
````

(Context lines above are the Chinese source file's own content, the counterpart of the English tutorial: "Define a `HybridParallelPlugin` object and specify the parallel strategies to use; this example uses pipeline parallelism together with ZeRO-1.")
examples/community/roberta/pretraining/run_pretraining.py

```diff
@@ -16,10 +16,10 @@ from utils.global_vars import get_tensorboard_writer, get_timers, set_global_var
 from utils.logger import Logger
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.context import ParallelMode
 from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
 from colossalai.tensor import ProcessGroup, ShardSpec
-from colossalai.utils import get_current_device
 from colossalai.utils.model.colo_init_context import ColoInitContext
@@ -53,7 +53,7 @@ def main():
     set_global_variables(launch_time, args.tensorboard_path)
 
     world_size = torch.distributed.get_world_size()
-    get_current_device()
+    get_accelerator().get_current_device()
 
     # build model, optimizer and criterion
     if args.distplan.startswith("CAI"):
@@ -67,7 +67,10 @@ def main():
         # build GPT model
         with ColoInitContext(
-            device=get_current_device(), dtype=torch.half, default_dist_spec=default_dist_spec, default_pg=shard_pg
+            device=get_accelerator().get_current_device(),
+            dtype=torch.half,
+            default_dist_spec=default_dist_spec,
+            default_pg=shard_pg,
         ):
             config, model, numel = get_model(args, logger)
@@ -78,7 +81,7 @@ def main():
         elif args.distplan == "CAI_Gemini":
             gemini_config = dict(
                 strict_ddp_mode=args.tp_degree == 1,
-                device=get_current_device(),
+                device=get_accelerator().get_current_device(),
                 placement_policy=args.placement,
                 pin_memory=True,
                 hidden_dim=model.config.hidden_size,
```
examples/images/dreambooth/train_dreambooth_colossalai.py

```diff
@@ -20,11 +20,11 @@ from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PretrainedConfig
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 
 disable_existing_loggers()
 logger = get_dist_logger()
@@ -386,7 +386,7 @@ def main(args):
         cur_class_images = len(list(class_images_dir.iterdir()))
 
         if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
+            torch_dtype = torch.float16 if get_accelerator().get_current_device() == "cuda" else torch.float32
             pipeline = DiffusionPipeline.from_pretrained(
                 args.pretrained_model_name_or_path,
                 torch_dtype=torch_dtype,
@@ -401,7 +401,7 @@ def main(args):
             sample_dataset = PromptDataset(args.class_prompt, num_new_images)
             sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
 
-            pipeline.to(get_current_device())
+            pipeline.to(get_accelerator().get_current_device())
 
             for example in tqdm(
                 sample_dataloader,
@@ -578,8 +578,8 @@ def main(args):
     # Move text_encode and vae to gpu.
     # For mixed precision training we cast the text_encoder and vae weights to half-precision
     # as these models are only used for inference, keeping weights in full precision is not required.
-    vae.to(get_current_device(), dtype=weight_dtype)
-    text_encoder.to(get_current_device(), dtype=weight_dtype)
+    vae.to(get_accelerator().get_current_device(), dtype=weight_dtype)
+    text_encoder.to(get_accelerator().get_current_device(), dtype=weight_dtype)
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader))
@@ -613,7 +613,7 @@ def main(args):
             torch.cuda.reset_peak_memory_stats()
             # Move batch to gpu
             for key, value in batch.items():
-                batch[key] = value.to(get_current_device(), non_blocking=True)
+                batch[key] = value.to(get_accelerator().get_current_device(), non_blocking=True)
 
             # Convert images to latent space
             optimizer.zero_grad()
```
examples/images/dreambooth/train_dreambooth_colossalai_lora.py

```diff
@@ -21,13 +21,13 @@ from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PretrainedConfig
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.legacy.context.parallel_mode import ParallelMode
 from colossalai.legacy.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 
 disable_existing_loggers()
 logger = get_dist_logger()
@@ -385,7 +385,7 @@ def main(args):
         cur_class_images = len(list(class_images_dir.iterdir()))
 
         if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if get_current_device() == "cuda" else torch.float32
+            torch_dtype = torch.float16 if get_accelerator().get_current_device() == "cuda" else torch.float32
             pipeline = DiffusionPipeline.from_pretrained(
                 args.pretrained_model_name_or_path,
                 torch_dtype=torch_dtype,
@@ -400,7 +400,7 @@ def main(args):
             sample_dataset = PromptDataset(args.class_prompt, num_new_images)
             sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
 
-            pipeline.to(get_current_device())
+            pipeline.to(get_accelerator().get_current_device())
 
             for example in tqdm(
                 sample_dataloader,
@@ -598,8 +598,8 @@ def main(args):
     # Move text_encode and vae to gpu.
     # For mixed precision training we cast the text_encoder and vae weights to half-precision
     # as these models are only used for inference, keeping weights in full precision is not required.
-    vae.to(get_current_device(), dtype=weight_dtype)
-    text_encoder.to(get_current_device(), dtype=weight_dtype)
+    vae.to(get_accelerator().get_current_device(), dtype=weight_dtype)
+    text_encoder.to(get_accelerator().get_current_device(), dtype=weight_dtype)
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader))
@@ -633,7 +633,7 @@ def main(args):
             torch.cuda.reset_peak_memory_stats()
             # Move batch to gpu
             for key, value in batch.items():
-                batch[key] = value.to(get_current_device(), non_blocking=True)
+                batch[key] = value.to(get_accelerator().get_current_device(), non_blocking=True)
 
             # Convert images to latent space
             optimizer.zero_grad()
```
examples/images/resnet/train.py

```diff
@@ -13,12 +13,12 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 
 # ==============================
 # Prepare Hyperparameters
@@ -53,8 +53,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
 @torch.no_grad()
 def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
     model.eval()
-    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
-    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
     for images, labels in test_dataloader:
         images = images.cuda()
         labels = labels.cuda()
```
examples/images/vit/vit_benchmark.py

```diff
@@ -33,9 +33,10 @@ def get_data_batch(batch_size, num_labels, num_channels=3, height=224, width=224):
 def colo_memory_cap(size_in_GB):
-    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    from colossalai.accelerator import get_accelerator
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction
 
-    cuda_capacity = colo_device_memory_capacity(get_current_device())
+    cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
     if size_in_GB * (1024**3) < cuda_capacity:
         colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
         print(f"Limiting GPU memory usage to {size_in_GB} GB")
```
examples/inference/benchmark_llama.py

```diff
@@ -6,10 +6,9 @@ import torch.distributed as dist
 import transformers
 
 import colossalai
-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.inference import InferenceEngine
 from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
-from colossalai.utils.device import get_current_device
 
 GIGABYTE = 1024**3
 MEGABYTE = 1024 * 1024
@@ -52,7 +51,7 @@ CONFIG_MAP = {
 def data_gen(batch_size: int = 4, seq_len: int = 512):
-    input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_current_device())
+    input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=get_accelerator().get_current_device())
     attention_mask = torch.ones_like(input_ids)
     data = dict(input_ids=input_ids, attention_mask=attention_mask)
     return data
@@ -97,9 +96,9 @@ def print_details_info(outputs, model_config, args, whole_end2end):
     msg += f"Flops: {num_parameters * num_bytes / whole_avg_latency / 1e12:.2f} TFLOPS\n"
 
     if torch.cuda.is_available():
-        msg += f"-------Memory Summary Device:{device_utils.current_device()}-------\n"
-        msg += f"Max memory allocated: {device_utils.max_memory_allocated() / GIGABYTE:.2f} GB\n"
-        msg += f"Max memory reserved: {device_utils.max_memory_reserved() / GIGABYTE:.2f} GB\n"
+        msg += f"-------Memory Summary Device:{get_accelerator().current_device()}-------\n"
+        msg += f"Max memory allocated: {get_accelerator().max_memory_allocated() / GIGABYTE:.2f} GB\n"
+        msg += f"Max memory reserved: {get_accelerator().max_memory_reserved() / GIGABYTE:.2f} GB\n"
 
     print(msg)
```
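The benchmark's memory summary now reads every statistic from the accelerator, so the same report can work on NPU builds. A compact sketch of the reporting step, using only the accelerator methods that appear in the diff (`memory_summary` itself is illustrative):

```python
from colossalai.accelerator import get_accelerator

GIGABYTE = 1024**3


def memory_summary() -> str:
    # Mirrors print_details_info above: device id plus peak allocated
    # and reserved memory in GB.
    acc = get_accelerator()
    return (
        f"-------Memory Summary Device:{acc.current_device()}-------\n"
        f"Max memory allocated: {acc.max_memory_allocated() / GIGABYTE:.2f} GB\n"
        f"Max memory reserved: {acc.max_memory_reserved() / GIGABYTE:.2f} GB\n"
    )
```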
examples/inference/run_llama_inference.py

```diff
@@ -5,9 +5,9 @@ import torch.distributed as dist
 from transformers import LlamaForCausalLM, LlamaTokenizer
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.inference import InferenceEngine
 from colossalai.testing import spawn
-from colossalai.utils.device import get_current_device
 
 INPUT_TEXTS = [
     "What is the longest river in the world?",
@@ -57,7 +57,7 @@ def run_inference(args):
     )
 
     inputs = tokenizer(INPUT_TEXTS, return_tensors="pt", padding="longest", max_length=max_input_len, truncation=True)
-    inputs = {k: v.to(get_current_device()) for k, v in inputs.items()}
+    inputs = {k: v.to(get_accelerator().get_current_device()) for k, v in inputs.items()}
     outputs = engine.generate(inputs)
 
     if rank == 0:
```
examples/language/bert/finetune.py

```diff
@@ -18,11 +18,11 @@ from transformers import (
 )
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 
 # ==============================
 # Prepare Hyperparameters
@@ -59,7 +59,7 @@ def evaluate_model(
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(ignore_chunk=True)
 
-    accum_loss = torch.zeros(1, device=get_current_device())
+    accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
     for batch in dataloader:
         batch = move_to_cuda(batch)
         labels = batch["labels"]
@@ -89,8 +89,10 @@ def evaluate_model(
                 object_list = [None, None]
                 dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)
 
-                metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
-                accum_loss.add_(object_list[1].to(get_current_device()))
+                metric.add_batch(
+                    predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
+                )
+                accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))
         else:
             batch = move_to_cuda(batch)
```