OpenDAS / ColossalAI / Commits / c622bb36

Unverified commit c622bb36, authored Jun 07, 2023 by Frank Lee, committed via GitHub on Jun 07, 2023.

Merge pull request #3915 from FrankLeeeee/update/develop

[sync] update develop with main

Parents: 34966378, 9c88b6cb
Changes: 174 files in the commit (20 shown on this page)
Showing 20 changed files with 278 additions and 33 deletions (+278, -33).
applications/Chat/examples/ray/test_ci.sh  (+12, -0)
applications/Chat/examples/test_ci.sh  (+3, -0)
colossalai/amp/naive_amp/mixed_precision_mixin/__init__.py  (+9, -0)
colossalai/amp/naive_amp/mixed_precision_mixin/base.py  (+91, -0)
colossalai/amp/naive_amp/mixed_precision_mixin/bf16.py  (+23, -0)
colossalai/amp/naive_amp/mixed_precision_mixin/fp16.py  (+84, -0)
colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py  (+1, -1)
colossalai/auto_parallel/tensor_shard/utils/broadcast.py  (+6, -6)
colossalai/booster/booster.py  (+2, -2)
colossalai/booster/plugin/gemini_plugin.py  (+8, -1)
colossalai/booster/plugin/low_level_zero_plugin.py  (+22, -11)
colossalai/booster/plugin/torch_fsdp_plugin.py  (+6, -1)
colossalai/cli/launcher/__init__.py  (+1, -1)
colossalai/cli/launcher/hostinfo.py  (+1, -1)
colossalai/cli/launcher/multinode_runner.py  (+1, -1)
colossalai/cli/launcher/run.py  (+2, -2)
colossalai/device/alpha_beta_profiler.py  (+1, -1)
colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py  (+1, -1)
colossalai/fx/tracer/experimental.py  (+1, -1)
colossalai/fx/tracer/tracer.py  (+3, -3)
applications/Chat/examples/ray/test_ci.sh  (new file, mode 100755)

#!/bin/bash
set -xe
BASE=$(realpath $(dirname $0))

export RAY_NAMESPACE=admin
export DATA=/data/scratch/chatgpt/prompts.csv

# install requirements
pip install -r ${BASE}/requirements.txt

python ${BASE}/mmmt_prompt.py --prompt_path $DATA --num_makers 2 --num_trainers 2 \
    --trainer_strategy colossalai_gemini --model opt --critic_model opt \
    --pretrain facebook/opt-350m --critic_pretrain facebook/opt-125m \
    --experience_batch_size 4 --train_batch_size 2
applications/Chat/examples/test_ci.sh

@@ -124,3 +124,6 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_datas
 rm -rf ${BASE}/rm_ckpt_gpt.pt
 rm -rf ${BASE}/actor_checkpoint_prompts.pt
+
+# 3080 doesn't support P2P, skip this test
+# cd ${BASE}/ray && bash test_ci.sh && cd ${BASE}
colossalai/amp/naive_amp/mixed_precision_mixin/__init__.py  (new file, mode 100644)

from .base import MixedPrecisionMixin
from .bf16 import BF16MixedPrecisionMixin
from .fp16 import FP16MixedPrecisionMixin

__all__ = [
    'MixedPrecisionMixin',
    'FP16MixedPrecisionMixin',
    'BF16MixedPrecisionMixin',
]
colossalai/amp/naive_amp/mixed_precision_mixin/base.py  (new file, mode 100644)

from abc import ABC, abstractmethod

import torch
from torch import Tensor


class MixedPrecisionMixin(ABC):
    """A helper class for mixed precision training. This mixin is used in mixed precision optimizers.

    Attributes:
        dtype (torch.dtype): The expected dtype of the gradients.

    Examples:
        ```python
        class MyMixedPrecisionOptimizer(OptimizerWrapper):
            def __init__(self, optim: Optimizer):
                super().__init__(optim)
                self.mixed_precision = MixedPrecisionMixin()

            def backward(self, loss):
                loss = self.mixed_precision.pre_backward(loss)
                loss.backward()

            def backward_by_grad(self, tensor, grad):
                grad = self.mixed_precision.pre_backward_by_grad(tensor, grad)
                tensor.backward(grad)

            def step(self):
                if self.mixed_precision.should_skip_step():
                    self.zero_grad()
                    return
                div_scale = self.mixed_precision.get_grad_div_scale()
                # maybe clip grad here
                # maybe scale grad here
                self.optim.step()

            def zero_grad(self):
                self.mixed_precision.pre_zero_grad()
                return self.optim.zero_grad()
        ```
    """
    dtype: torch.dtype

    @abstractmethod
    def pre_backward(self, loss: Tensor) -> Tensor:
        """Called before backward.

        Args:
            loss (Tensor): Loss value.

        Returns:
            Tensor: Loss value (possibly scaled).
        """
        pass

    @abstractmethod
    def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
        """Called before backward by grad. This is helpful for pipeline parallelism.

        Args:
            tensor (Tensor): Tensor to backward.
            grad (Tensor): Gradient of the tensor.

        Returns:
            Tensor: Gradient of the tensor (possibly scaled).
        """
        pass

    @abstractmethod
    def should_skip_step(self) -> bool:
        """Called before step.

        Returns:
            bool: Whether to skip the step.
        """
        pass

    @abstractmethod
    def pre_zero_grad(self) -> None:
        """Called before zero_grad."""
        pass

    @abstractmethod
    def get_grad_div_scale(self) -> float:
        """Called before step or clip_grad. To keep computation efficiency, this method does not (maybe) unscale grads.

        Returns:
            float: A divisor for gradient clipping or step.
        """
        pass
colossalai/amp/naive_amp/mixed_precision_mixin/bf16.py  (new file, mode 100644)

import torch
from torch import Tensor

from .base import MixedPrecisionMixin


class BF16MixedPrecisionMixin(MixedPrecisionMixin):
    dtype = torch.bfloat16

    def pre_backward(self, loss: Tensor) -> Tensor:
        return loss

    def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
        return grad

    def should_skip_step(self) -> bool:
        return False

    def pre_zero_grad(self) -> None:
        pass

    def get_grad_div_scale(self) -> float:
        return 1.0
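For context, the bf16 mixin above is intentionally a pass-through: bf16 has the same exponent range as fp32, so no loss scaling or overflow bookkeeping is needed. A tiny sketch of that behaviour, assuming only that ColossalAI is importable:

```python
import torch

from colossalai.amp.naive_amp.mixed_precision_mixin import BF16MixedPrecisionMixin

mixin = BF16MixedPrecisionMixin()
loss = torch.tensor(2.0, dtype=torch.bfloat16)

assert mixin.pre_backward(loss) is loss      # loss is returned unscaled
assert mixin.should_skip_step() is False     # steps are never skipped
assert mixin.get_grad_div_scale() == 1.0     # gradients are never divided
```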
colossalai/amp/naive_amp/mixed_precision_mixin/fp16.py  (new file, mode 100644)

from abc import abstractmethod
from enum import Enum

import torch
import torch.distributed as dist
from torch import Tensor

from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
from colossalai.utils import get_current_device

from .base import MixedPrecisionMixin


class OptimState(Enum):
    SCALED = 0
    UNSCALED = 1


class FP16MixedPrecisionMixin(MixedPrecisionMixin):
    dtype = torch.float16

    def __init__(self,
                 initial_scale: float = 2**16,
                 min_scale: float = 1,
                 growth_factor: float = 2,
                 backoff_factor: float = 0.5,
                 growth_interval: int = 1000,
                 hysteresis: int = 2,
                 max_scale: float = 2**32) -> None:
        super().__init__()
        self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale,
                                             min_scale=min_scale,
                                             growth_factor=growth_factor,
                                             backoff_factor=backoff_factor,
                                             growth_interval=growth_interval,
                                             hysteresis=hysteresis,
                                             max_scale=max_scale)
        self.optim_state = OptimState.UNSCALED
        self.found_overflow = torch.zeros(1, dtype=torch.float, device=get_current_device())

    @property
    def loss_scale(self) -> float:
        return self.grad_scaler.scale.item()

    @abstractmethod
    def check_local_overflow(self) -> bool:
        """Check whether there is overflow in the local process. This method should be implemented by subclasses.

        Returns:
            bool: Whether there is overflow in the local process.
        """
        pass

    def check_overflow(self) -> bool:
        # clear previous overflow record
        self.found_overflow.fill_(0.0)
        if self.check_local_overflow():
            self.found_overflow.fill_(1.0)
        dist.all_reduce(self.found_overflow, op=dist.ReduceOp.MAX)
        return self.found_overflow.item() > 0

    def pre_backward(self, loss: Tensor) -> Tensor:
        loss = self.loss_scale * loss
        self.optim_state = OptimState.SCALED
        return loss

    def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
        self.optim_state = OptimState.SCALED
        return grad

    def should_skip_step(self) -> bool:
        found_inf = self.check_overflow()
        self.grad_scaler.update(found_inf)
        if found_inf:
            self.optim_state = OptimState.UNSCALED
        return found_inf

    def pre_zero_grad(self) -> None:
        pass

    def get_grad_div_scale(self) -> float:
        assert self.optim_state == OptimState.SCALED, 'grads should be scaled before clipping'
        self.optim_state = OptimState.UNSCALED
        return self.loss_scale
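FP16MixedPrecisionMixin is still abstract: a subclass supplies check_local_overflow, and the base class handles loss scaling, the cross-rank all-reduce and the dynamic scale update. Below is a minimal sketch of such a subclass; the working_grads container is a hypothetical list of local fp16 gradients, not an API introduced by this commit.

```python
import torch

from colossalai.amp.naive_amp.mixed_precision_mixin import FP16MixedPrecisionMixin


class NaiveFP16MixedPrecisionMixin(FP16MixedPrecisionMixin):

    def __init__(self, working_grads, **scaler_kwargs):
        super().__init__(**scaler_kwargs)
        # hypothetical list of local fp16 gradient tensors tracked by the optimizer
        self.working_grads = working_grads

    def check_local_overflow(self) -> bool:
        # report overflow if any local gradient contains inf or nan
        return any(not torch.isfinite(g).all() for g in self.working_grads)
```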
colossalai/auto_parallel/tensor_shard/node_handler/matmul_handler.py

@@ -206,7 +206,7 @@ class Broadcaster(BmmTransform):
                     # e.g. [1, 2, 4] x [4, 4, 8] -> [4, 2, 8]
                     # the dim 0 of [1, 2, 4] is multiplied to 4
                     tensor_shape[dim_idx] = 1
-                elif broadcast_type == BroadcastType.PADDDING:
+                elif broadcast_type == BroadcastType.PADDING:
                     # if the dim is padded
                     # we remove its sharding
                     tensor_shape[dim_idx] = None
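The comment in this hunk describes batched-matmul broadcasting; a quick, self-contained check of that example in plain PyTorch (unrelated to the handler itself):

```python
import torch

# the size-1 batch dim of the left operand is broadcast ("multiplied") to 4
out = torch.matmul(torch.randn(1, 2, 4), torch.randn(4, 4, 8))
assert out.shape == torch.Size([4, 2, 8])
```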
colossalai/auto_parallel/tensor_shard/utils/broadcast.py

@@ -21,7 +21,7 @@ __all__ = [
 class BroadcastType(Enum):
     EQUAL = auto()
-    PADDDING = auto()
+    PADDING = auto()
     MULTIPLE = auto()

@@ -69,18 +69,18 @@ def get_broadcast_dim_info(logical_shape, physical_shape):
     for i in range(logical_num_dims):
         # get the trailing dim size
         logical_dim_idx = logical_num_dims - i - 1
-        phyiscal_dim_idx = physical_num_dims - i - 1
+        physical_dim_idx = physical_num_dims - i - 1
         logical_dim_size = logical_shape[logical_dim_idx]

-        if phyiscal_dim_idx >= 0:
-            physical_dim_size = physical_shape[phyiscal_dim_idx]
+        if physical_dim_idx >= 0:
+            physical_dim_size = physical_shape[physical_dim_idx]

             if physical_dim_size == logical_dim_size:
                 logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.EQUAL
             elif physical_dim_size == 1 and physical_dim_size != logical_dim_size:
                 logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.MULTIPLE
         else:
-            logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDDING
+            logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDING

     return logical_dim_broadcast_info

@@ -117,7 +117,7 @@ def recover_sharding_spec_for_broadcast_shape(logical_sharding_spec: ShardingSpe
     for shape_dim, mesh_dim in logical_dim_partition.items():
         logical_broadcast_type = logical_dim_broadcast_info[shape_dim]

-        if logical_broadcast_type == BroadcastType.PADDDING or logical_broadcast_type == BroadcastType.MULTIPLE:
+        if logical_broadcast_type == BroadcastType.PADDING or logical_broadcast_type == BroadcastType.MULTIPLE:
             removed_dims.extend(mesh_dim)
         else:
             # get the corresponding physical dim
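To illustrate the renamed enum member, here is a rough sketch of how get_broadcast_dim_info is expected to classify dimensions, based on the logic visible in the hunk above. The shapes are made up, and the exact return mapping is an assumption, not output from this commit:

```python
from colossalai.auto_parallel.tensor_shard.utils.broadcast import BroadcastType, get_broadcast_dim_info

logical_shape = [4, 2, 8]    # shape after broadcasting
physical_shape = [2, 8]      # original operand shape, one dim shorter

info = get_broadcast_dim_info(logical_shape, physical_shape)
# expected roughly: {2: BroadcastType.EQUAL, 1: BroadcastType.EQUAL, 0: BroadcastType.PADDING}
# i.e. the leading logical dim with no physical counterpart is the "padded" one
```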
colossalai/booster/booster.py

@@ -25,11 +25,11 @@ class Booster:
     Examples:
         ```python
         colossalai.launch(...)
-        plugin = GeminiPlugin(stage=3, ...)
+        plugin = GeminiPlugin(...)
         booster = Booster(precision='fp16', plugin=plugin)

         model = GPT2()
-        optimizer = Adam(model.parameters())
+        optimizer = HybridAdam(model.parameters())
         dataloader = Dataloader(Dataset)
         lr_scheduler = LinearWarmupScheduler()
         criterion = GPTLMLoss()
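The docstring fix above swaps the plain Adam for HybridAdam and drops the stray stage argument. A fuller sketch of that flow is below; MyModel and MyLoss are placeholders, and the launch/boost calls follow the Booster API as documented elsewhere in the repo rather than anything added by this commit:

```python
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

colossalai.launch_from_torch(config={})

plugin = GeminiPlugin(placement_policy='cpu', precision='fp16')
booster = Booster(plugin=plugin)

model = MyModel()                            # placeholder model
optimizer = HybridAdam(model.parameters())   # Gemini expects HybridAdam, not torch.optim.Adam
criterion = MyLoss()                         # placeholder loss

model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)
```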
colossalai/booster/plugin/gemini_plugin.py

@@ -23,6 +23,9 @@ from .dp_plugin_base import DPPluginBase
 __all__ = ['GeminiPlugin']

+SUPPORTED_PRECISION = ['fp16', 'bf16']
+PRECISION_STR_TO_DTYPE = {'fp16': torch.half, 'bf16': torch.bfloat16}
+

 class GeminiCheckpointIO(GeneralCheckpointIO):

@@ -171,6 +174,7 @@ class GeminiPlugin(DPPluginBase):
     Args:
         device (torch.device): device to place the model.
         placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
+        precision (str, optional): precision. Support 'fp16' and 'bf16'. Defaults to 'fp16'.
         pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
         force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
         strict_ddp_mode (bool, optional): use strict ddp mode (only use dp without other parallelism). Defaults to False.

@@ -203,6 +207,7 @@ class GeminiPlugin(DPPluginBase):
         self,
         device: Optional[torch.device] = None,
         placement_policy: str = "cpu",
+        precision: str = "fp16",
         pin_memory: bool = False,
         force_outputs_fp32: bool = False,
         strict_ddp_mode: bool = False,

@@ -223,6 +228,7 @@ class GeminiPlugin(DPPluginBase):
         verbose: bool = False,
     ) -> None:
         super().__init__()
+        assert precision in SUPPORTED_PRECISION, f'precision {precision} is not supported'
         self.gemini_config = dict(
             device=(device or get_current_device()),
             placement_policy=placement_policy,

@@ -233,6 +239,7 @@ class GeminiPlugin(DPPluginBase):
             hidden_dim=hidden_dim,
             min_chunk_size_mb=min_chunk_size_mb,
             memstats=memstats,
+            mixed_precision=PRECISION_STR_TO_DTYPE[precision],
         )
         self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio,)
         self.optim_kwargs = dict(initial_scale=initial_scale,

@@ -253,7 +260,7 @@ class GeminiPlugin(DPPluginBase):
         return True

     def supported_precisions(self) -> List[str]:
-        return ['fp16']
+        return SUPPORTED_PRECISION

     def control_device(self) -> bool:
         return True
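A small standalone sketch of the new precision handling follows. The constants are re-declared here for illustration, and resolve_mixed_precision is a hypothetical helper; the plugin performs the same validation inline in __init__:

```python
import torch

SUPPORTED_PRECISION = ['fp16', 'bf16']
PRECISION_STR_TO_DTYPE = {'fp16': torch.half, 'bf16': torch.bfloat16}


def resolve_mixed_precision(precision: str = 'fp16') -> torch.dtype:
    # same check the plugin performs before building its gemini_config
    assert precision in SUPPORTED_PRECISION, f'precision {precision} is not supported'
    return PRECISION_STR_TO_DTYPE[precision]


assert resolve_mixed_precision('bf16') is torch.bfloat16
assert resolve_mixed_precision() is torch.float16    # torch.half is torch.float16
```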
colossalai/booster/plugin/low_level_zero_plugin.py

 import warnings
+from functools import partial
 from typing import Callable, Iterator, List, Optional, Tuple, Union

 import torch

@@ -20,12 +21,15 @@ from .torch_ddp_plugin import TorchDDPCheckpointIO
 __all__ = ['LowLevelZeroPlugin']


-def _convert_to_fp16(x):
+def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
     if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
-        return x.half()
+        return x.to(dtype)
     return x


+SUPPORTED_PRECISION = ['fp16', 'bf16', 'fp32']
+
+
 class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO):

     def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool):

@@ -49,17 +53,24 @@ class LowLevelZeroModel(ModelWrapper):
     def __init__(self, module: nn.Module, stage: int, precision: str) -> None:
         super().__init__(module)
-        self.convert_inputs = (precision == 'fp16')
-        module = zero_model_wrapper(module, zero_stage=stage)
+        self.dtype = None
         if precision == 'fp16':
-            module = module.half()
+            self.dtype = torch.float16
+        elif precision == 'bf16':
+            self.dtype = torch.bfloat16
+        module = zero_model_wrapper(module, zero_stage=stage)
+        if self.dtype is not None:
+            module = module.to(self.dtype)
         module = module.to(get_current_device())
         self.module = module
+        self.convert_fn = None
+        if self.dtype is not None:
+            self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)

     def forward(self, *args, **kwargs):
-        if self.convert_inputs:
-            args = tree_map(_convert_to_fp16, args)
-            kwargs = tree_map(_convert_to_fp16, kwargs)
+        if self.convert_fn is not None:
+            args = tree_map(self.convert_fn, args)
+            kwargs = tree_map(self.convert_fn, kwargs)
         return super().forward(*args, **kwargs)

@@ -110,7 +121,7 @@ class LowLevelZeroPlugin(DPPluginBase):
     Args:
         strage (int, optional): ZeRO stage. Defaults to 1.
-        precision (str, optional): precision. Support 'fp16' and 'fp32'. Defaults to 'fp16'.
+        precision (str, optional): precision. Support 'fp16', 'bf16' and 'fp32'. Defaults to 'fp16'.
         initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
         min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
         growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.

@@ -149,7 +160,7 @@ class LowLevelZeroPlugin(DPPluginBase):
     ) -> None:
         super().__init__()
         assert stage in (1, 2), f'LowLevelZeroPlugin only supports stage 1/2 training'
-        assert precision in ('fp16', 'fp32'), f'LowLevelZeroPlugin only supports fp16/fp32 training'
+        assert precision in SUPPORTED_PRECISION, f'LowLevelZeroPlugin only supports {SUPPORTED_PRECISION} training'
         self.stage = stage
         self.precision = precision

@@ -175,7 +186,7 @@ class LowLevelZeroPlugin(DPPluginBase):
         return True

     def supported_precisions(self) -> List[str]:
-        return ['fp16', 'fp32']
+        return SUPPORTED_PRECISION

     def control_device(self) -> bool:
         return True
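The core of this change is that the input-cast helper is no longer hard-wired to fp16: the plugin now builds convert_fn with functools.partial and applies it with tree_map. A minimal sketch of that behaviour, with the helper re-declared locally (the tree_map import path below is an assumption based on common usage, not taken from the diff):

```python
from functools import partial

import torch
from torch.utils._pytree import tree_map


def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
    # only floating-point tensors are cast; integer/bool tensors and non-tensors pass through
    if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
        return x.to(dtype)
    return x


convert_fn = partial(_convert_floating_point, dtype=torch.bfloat16)
batch = {'input': torch.randn(2, 4), 'mask': torch.ones(2, 4, dtype=torch.bool)}
batch = tree_map(convert_fn, batch)

assert batch['input'].dtype is torch.bfloat16
assert batch['mask'].dtype is torch.bool    # non-floating tensors are left untouched
```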
colossalai/booster/plugin/torch_fsdp_plugin.py

@@ -3,10 +3,10 @@ from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
+import warnings
 from packaging import version
 from torch.distributed import ProcessGroup

 if version.parse(torch.__version__) >= version.parse('1.12.0'):
     from torch.distributed.fsdp import FullStateDictConfig
     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

@@ -202,6 +202,11 @@ class TorchFSDPPlugin(DPPluginBase):
         # wrap the model with PyTorch FSDP
         fsdp_model = TorchFSDPModel(model, device_id=torch.cuda.current_device(), **self.fsdp_kwargs)

+        if len(optimizer.param_groups) > 1:
+            warnings.warn(
+                'TorchFSDPPlugin does not support optimizer that use multi param groups. The results may not be as expected if used.'
+            )
+        optimizer.__init__(fsdp_model.parameters(), **optimizer.defaults)
+
         if not isinstance(optimizer, FSDPOptimizerWrapper):
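The added lines rebuild the optimizer on the FSDP-wrapped parameters, which only preserves the defaults of a single param group, hence the warning. A rough illustration of that re-initialisation trick on a plain torch optimizer (no FSDP involved; it only shows why defaults survive while per-group overrides would not):

```python
import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

# pretend `wrapped` is the FSDP-wrapped module whose flat parameters must now be optimized
wrapped = model
optimizer.__init__(wrapped.parameters(), **optimizer.defaults)

assert optimizer.param_groups[0]['lr'] == 0.1          # defaults carried over
assert optimizer.param_groups[0]['momentum'] == 0.9
```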
colossalai/cli/launcher/__init__.py

@@ -28,7 +28,7 @@ from .run import launch_multi_processes
               type=str,
               default=None,
-              help="Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
+              help="Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
               " only effective when used with --hostfile.")
 @click.option("--num_nodes",
               type=int,
colossalai/cli/launcher/hostinfo.py

@@ -38,7 +38,7 @@ class HostInfo:
         # socket.getfqdn("127.0.0.1") does not return localhost
         # on some users' machines
-        # thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0
+        # thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
         if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
             return True
colossalai/cli/launcher/multinode_runner.py

@@ -114,7 +114,7 @@ class MultiNodeRunner:
         Receive messages from all hosts

         Returns:
-            msg_from_node (dict): a dictionry which contains messages from each node
+            msg_from_node (dict): a dictionary which contains messages from each node
         """
         msg_from_node = dict()
colossalai/cli/launcher/run.py

@@ -154,7 +154,7 @@ def get_launch_command(
         extra_launch_args = dict()

     torch_version = version.parse(torch.__version__)
-    assert torch_version.major == 1
+    assert torch_version.major >= 1

     if torch_version.minor < 9:
         cmd = [

@@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None:
     # receive the stop status
     msg_from_node = runner.recv_from_all()

-    # printe node status
+    # print node status
     click.echo("\n====== Stopping All Nodes =====")
     for hostname, msg in msg_from_node.items():
         click.echo(f"{hostname}: {msg}")
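The relaxed assertion matters now that PyTorch 2.x exists: packaging's Version exposes integer major/minor fields, so the new check passes for 2.0 while the old `== 1` comparison would abort the launcher. A small self-contained check (packaging is assumed to be installed, as the launcher already imports it):

```python
from packaging import version

v = version.parse('2.0.1')
assert v.major >= 1            # new check: any torch >= 1.x is accepted
assert not (v.major == 1)      # old check would have rejected torch 2.x
```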
colossalai/device/alpha_beta_profiler.py

@@ -197,7 +197,7 @@ class AlphaBetaProfiler:
             dist.broadcast_object_list(broadcast_list, src=process_group[0])
             alpha_beta_dict[process_group] = tuple(broadcast_list)

-        # add symmetry pair to the apha_beta_dict
+        # add symmetry pair to the alpha_beta_dict
         symmetry_ab_dict = {}
         for process_group, alpha_beta_pair in alpha_beta_dict.items():
             symmetry_process_group = (process_group[1], process_group[0])
colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_module/bias_addition_module.py

@@ -51,7 +51,7 @@ class BiasAdditionModule(ABC):
         For example:
             The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are
-            considered during module initilizing. However, we need to consider those attributes as kwargs
+            considered during module initializing. However, we need to consider those attributes as kwargs
             in F.conv2d.
         """
         pass
colossalai/fx/tracer/experimental.py

@@ -295,7 +295,7 @@ class ColoTracer(Tracer):
     @staticmethod
     def forward(ctx, run_function, preserve_rng_state, *args):
-        # signal that the current tracing occurs within activaton checkpoint part
+        # signal that the current tracing occurs within activation checkpoint part
         self.inside_torch_checkpoint_func = True
         out = run_function(*args)
         self.inside_torch_checkpoint_func = False
colossalai/fx/tracer/tracer.py

@@ -92,7 +92,7 @@ class ColoTracer(Tracer):
             return proxy

         # if graph is traced for auto parallelism module, some extra node will be added during
-        # graph construction to deal with the compatability between bias addition and all reduce.
+        # graph construction to deal with the compatibility between bias addition and all reduce.
         # if no extra manipulation is applied, we just pass the origin arguments to create_proxy function
         # to create node on computation graph

@@ -208,7 +208,7 @@ class ColoTracer(Tracer):
             self.proxy_cls = ColoProxy
             self.tracer_type = TracerType.META
         else:
-            raise ValueError(f"Unrecognised tracer type {tracer_type}")
+            raise ValueError(f"Unrecognized tracer type {tracer_type}")

     def _meta_data_computing(self, kind, target, args, kwargs):

@@ -445,7 +445,7 @@ class ColoTracer(Tracer):
     @staticmethod
     def forward(ctx, run_function, preserve_rng_state, *args):
-        # signal that the current tracing occurs within activaton checkpoint part
+        # signal that the current tracing occurs within activation checkpoint part
         self.inside_torch_checkpoint_func = True
         out = run_function(*args)
         self.inside_torch_checkpoint_func = False