Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
53cb5848
Unverified
Commit
53cb5848
authored
Apr 12, 2022
by
Jiarui Fang
Committed by
GitHub
Apr 12, 2022
Browse files
[utils] correct cpu memory used and capacity in the context of multi-process (#726)
parent
7db3ccc7
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
52 additions
and
20 deletions
+52
-20
colossalai/utils/memory.py
colossalai/utils/memory.py
+10
-3
colossalai/utils/memory_tracer/async_memtracer.py
colossalai/utils/memory_tracer/async_memtracer.py
+1
-0
tests/test_moe/test_moe_zero_init.py
tests/test_moe/test_moe_zero_init.py
+1
-1
tests/test_moe/test_moe_zero_model.py
tests/test_moe/test_moe_zero_model.py
+1
-1
tests/test_moe/test_moe_zero_optim.py
tests/test_moe/test_moe_zero_optim.py
+1
-1
tests/test_utils/test_memory.py
tests/test_utils/test_memory.py
+32
-0
tests/test_zero/common.py
tests/test_zero/common.py
+0
-0
tests/test_zero/test_found_inf.py
tests/test_zero/test_found_inf.py
+1
-1
tests/test_zero/test_init_context.py
tests/test_zero/test_init_context.py
+0
-0
tests/test_zero/test_shard_model_v2.py
tests/test_zero/test_shard_model_v2.py
+0
-0
tests/test_zero/test_shard_param.py
tests/test_zero/test_shard_param.py
+1
-1
tests/test_zero/test_sharded_optim_v2.py
tests/test_zero/test_sharded_optim_v2.py
+0
-0
tests/test_zero/test_sharded_optim_with_sync_bn.py
tests/test_zero/test_sharded_optim_with_sync_bn.py
+0
-0
tests/test_zero/test_state_dict.py
tests/test_zero/test_state_dict.py
+0
-0
tests/test_zero/test_stateful_tensor_mgr.py
tests/test_zero/test_stateful_tensor_mgr.py
+0
-0
tests/test_zero/test_tensor_utils.py
tests/test_zero/test_tensor_utils.py
+4
-12
tests/test_zero/test_zero_engine.py
tests/test_zero/test_zero_engine.py
+0
-0
No files found.
colossalai/utils/memory.py
View file @
53cb5848
...
@@ -8,6 +8,7 @@ from colossalai.utils import get_current_device
...
@@ -8,6 +8,7 @@ from colossalai.utils import get_current_device
from
colossalai.core
import
global_context
as
gpc
from
colossalai.core
import
global_context
as
gpc
from
colossalai.context.parallel_mode
import
ParallelMode
from
colossalai.context.parallel_mode
import
ParallelMode
from
colossalai.logging
import
get_dist_logger
from
colossalai.logging
import
get_dist_logger
from
packaging
import
version
_GLOBAL_CUDA_MEM_FRACTION
=
1.0
_GLOBAL_CUDA_MEM_FRACTION
=
1.0
...
@@ -106,7 +107,8 @@ def colo_device_memory_capacity(device: torch.device) -> int:
...
@@ -106,7 +107,8 @@ def colo_device_memory_capacity(device: torch.device) -> int:
assert
isinstance
(
device
,
torch
.
device
)
assert
isinstance
(
device
,
torch
.
device
)
if
device
.
type
==
'cpu'
:
if
device
.
type
==
'cpu'
:
mem_info
=
_get_cpu_memory_info
()
mem_info
=
_get_cpu_memory_info
()
return
mem_info
.
info
.
total
/
gpc
.
get_world_size
(
ParallelMode
.
DATA
)
# In the context of 1-CPU-N-GPU, the memory capacity of the current process is 1/N of the overall CPU memory.
return
mem_info
.
total
/
gpc
.
num_processes_on_current_node
if
device
.
type
==
'cuda'
:
if
device
.
type
==
'cuda'
:
return
torch
.
cuda
.
get_device_properties
(
get_current_device
()).
total_memory
*
_GLOBAL_CUDA_MEM_FRACTION
return
torch
.
cuda
.
get_device_properties
(
get_current_device
()).
total_memory
*
_GLOBAL_CUDA_MEM_FRACTION
...
@@ -123,8 +125,9 @@ def colo_device_memory_used(device: torch.device) -> int:
...
@@ -123,8 +125,9 @@ def colo_device_memory_used(device: torch.device) -> int:
"""
"""
if
device
.
type
==
'cpu'
:
if
device
.
type
==
'cpu'
:
mem_info
=
_get_cpu_memory_info
()
mem_info
=
_get_cpu_memory_info
()
# FIXME(jiaruifang) we need get how many processes are using the CPU memory.
# In the context of 1-CPU-N-GPU, the memory usage of the current process is 1/N of the CPU memory used.
ret
=
mem_info
.
used
/
gpc
.
get_world_size
(
ParallelMode
.
DATA
)
# Each process consumes the same amount of memory.
ret
=
mem_info
.
used
/
gpc
.
num_processes_on_current_node
return
ret
return
ret
elif
device
.
type
==
'cuda'
:
elif
device
.
type
==
'cuda'
:
ret
:
int
=
torch
.
cuda
.
memory_allocated
(
device
)
ret
:
int
=
torch
.
cuda
.
memory_allocated
(
device
)
...
@@ -142,6 +145,10 @@ def colo_set_process_memory_fraction(ratio: float) -> None:
...
@@ -142,6 +145,10 @@ def colo_set_process_memory_fraction(ratio: float) -> None:
Args:
Args:
ratio (float): a ratio between 0. ~ 1.
ratio (float): a ratio between 0. ~ 1.
"""
"""
if
version
.
parse
(
torch
.
__version__
)
<
version
.
parse
(
'1.8'
):
logger
=
get_dist_logger
(
'colo_set_process_memory_fraction'
)
logger
.
warning
(
'colo_set_process_memory_fraction failed because torch version is less than 1.8'
)
return
global
_GLOBAL_CUDA_MEM_FRACTION
global
_GLOBAL_CUDA_MEM_FRACTION
_GLOBAL_CUDA_MEM_FRACTION
=
ratio
_GLOBAL_CUDA_MEM_FRACTION
=
ratio
torch
.
cuda
.
set_per_process_memory_fraction
(
_GLOBAL_CUDA_MEM_FRACTION
,
get_current_device
())
torch
.
cuda
.
set_per_process_memory_fraction
(
_GLOBAL_CUDA_MEM_FRACTION
,
get_current_device
())
colossalai/utils/memory_tracer/async_memtracer.py
View file @
53cb5848
...
@@ -31,6 +31,7 @@ class AsyncMemoryMonitor:
...
@@ -31,6 +31,7 @@ class AsyncMemoryMonitor:
async_mem_monitor.finish()
async_mem_monitor.finish()
async_mem_monitor.save('log.pkl')
async_mem_monitor.save('log.pkl')
Args:
Args:
power (int, optional): the power of time interval. Defaults to 10.
power (int, optional): the power of time interval. Defaults to 10.
...
...
tests/test_moe/test_moe_zero_init.py
View file @
53cb5848
...
@@ -16,7 +16,7 @@ from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardS
...
@@ -16,7 +16,7 @@ from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardS
from
colossalai.testing
import
rerun_on_exception
from
colossalai.testing
import
rerun_on_exception
from
colossalai.utils
import
get_current_device
from
colossalai.utils
import
get_current_device
from
tests.test_zero
_data_parallel
.common
import
CONFIG
from
tests.test_zero.common
import
CONFIG
class
MoeModel
(
CheckpointModule
):
class
MoeModel
(
CheckpointModule
):
...
...
tests/test_moe/test_moe_zero_model.py
View file @
53cb5848
...
@@ -16,7 +16,7 @@ from colossalai.engine.gradient_handler import MoeGradientHandler
...
@@ -16,7 +16,7 @@ from colossalai.engine.gradient_handler import MoeGradientHandler
from
colossalai.context
import
MOE_CONTEXT
from
colossalai.context
import
MOE_CONTEXT
from
colossalai.testing
import
assert_equal_in_group
from
colossalai.testing
import
assert_equal_in_group
from
tests.test_zero
_data_parallel
.common
import
CONFIG
,
check_grads_padding
,
run_fwd_bwd
from
tests.test_zero.common
import
CONFIG
,
check_grads_padding
,
run_fwd_bwd
from
tests.test_moe.test_moe_zero_init
import
MoeModel
from
tests.test_moe.test_moe_zero_init
import
MoeModel
...
...
tests/test_moe/test_moe_zero_optim.py
View file @
53cb5848
...
@@ -20,7 +20,7 @@ from colossalai.engine.gradient_handler import MoeGradientHandler
...
@@ -20,7 +20,7 @@ from colossalai.engine.gradient_handler import MoeGradientHandler
from
colossalai.context
import
MOE_CONTEXT
from
colossalai.context
import
MOE_CONTEXT
from
colossalai.testing
import
assert_equal_in_group
from
colossalai.testing
import
assert_equal_in_group
from
tests.test_zero
_data_parallel
.common
import
CONFIG
,
check_sharded_model_params
from
tests.test_zero.common
import
CONFIG
,
check_sharded_model_params
from
tests.test_moe.test_moe_zero_init
import
MoeModel
from
tests.test_moe.test_moe_zero_init
import
MoeModel
...
...
tests/test_utils/test_memory.py
0 → 100644
View file @
53cb5848
import
pytest
import
colossalai
from
colossalai.utils.cuda
import
get_current_device
from
colossalai.utils.memory
import
colo_set_process_memory_fraction
,
colo_device_memory_capacity
from
colossalai.utils
import
free_port
from
functools
import
partial
import
torch.multiprocessing
as
mp
def
_run_colo_set_process_memory_fraction_and_colo_device_memory_capacity
():
frac1
=
colo_device_memory_capacity
(
get_current_device
())
colo_set_process_memory_fraction
(
0.5
)
frac2
=
colo_device_memory_capacity
(
get_current_device
())
assert
frac2
*
2
==
frac1
def
run_dist
(
rank
,
world_size
,
port
):
colossalai
.
launch
(
config
=
{},
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
_run_colo_set_process_memory_fraction_and_colo_device_memory_capacity
()
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
parametrize
(
"world_size"
,
[
4
,
5
])
def
test_memory_utils
(
world_size
):
run_func
=
partial
(
run_dist
,
world_size
=
world_size
,
port
=
free_port
())
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
test_memory_utils
(
world_size
=
2
)
tests/test_zero
_data_parallel
/common.py
→
tests/test_zero/common.py
View file @
53cb5848
File moved
tests/test_zero
_data_parallel
/test_found_inf.py
→
tests/test_zero/test_found_inf.py
View file @
53cb5848
...
@@ -14,7 +14,7 @@ from colossalai.zero.sharded_model import ShardedModelV2
...
@@ -14,7 +14,7 @@ from colossalai.zero.sharded_model import ShardedModelV2
from
colossalai.zero.sharded_optim
import
ShardedOptimizerV2
from
colossalai.zero.sharded_optim
import
ShardedOptimizerV2
from
colossalai.zero.sharded_optim._utils
import
has_inf_or_nan
from
colossalai.zero.sharded_optim._utils
import
has_inf_or_nan
from
tests.components_to_test.registry
import
non_distributed_component_funcs
from
tests.components_to_test.registry
import
non_distributed_component_funcs
from
tests.test_zero
_data_parallel
.test_sharded_optim_v2
import
_run_step
from
tests.test_zero.test_sharded_optim_v2
import
_run_step
from
common
import
CONFIG
from
common
import
CONFIG
...
...
tests/test_zero
_data_parallel
/test_init_context.py
→
tests/test_zero/test_init_context.py
View file @
53cb5848
File moved
tests/test_zero
_data_parallel
/test_shard_model_v2.py
→
tests/test_zero/test_shard_model_v2.py
View file @
53cb5848
File moved
tests/test_zero
_data_parallel
/test_shard_param.py
→
tests/test_zero/test_shard_param.py
View file @
53cb5848
...
@@ -11,7 +11,7 @@ from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardS
...
@@ -11,7 +11,7 @@ from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardS
from
colossalai.zero.sharded_param
import
ShardedTensor
from
colossalai.zero.sharded_param
import
ShardedTensor
from
colossalai.zero.sharded_param.sharded_param
import
ShardedParamV2
from
colossalai.zero.sharded_param.sharded_param
import
ShardedParamV2
from
colossalai.testing
import
rerun_on_exception
from
colossalai.testing
import
rerun_on_exception
from
tests.test_zero
_data_parallel
.common
import
CONFIG
,
allclose
from
tests.test_zero.common
import
CONFIG
,
allclose
from
colossalai.zero.sharded_param.tensorful_state
import
StatefulTensor
from
colossalai.zero.sharded_param.tensorful_state
import
StatefulTensor
...
...
tests/test_zero
_data_parallel
/test_sharded_optim_v2.py
→
tests/test_zero/test_sharded_optim_v2.py
View file @
53cb5848
File moved
tests/test_zero
_data_parallel
/test_sharded_optim_with_sync_bn.py
→
tests/test_zero/test_sharded_optim_with_sync_bn.py
View file @
53cb5848
File moved
tests/test_zero
_data_parallel
/test_state_dict.py
→
tests/test_zero/test_state_dict.py
View file @
53cb5848
File moved
tests/test_zero
_data_parallel
/test_stateful_tensor_mgr.py
→
tests/test_zero/test_stateful_tensor_mgr.py
View file @
53cb5848
File moved
tests/test_
utils
/test_tensor_
move
.py
→
tests/test_
zero
/test_tensor_
utils
.py
View file @
53cb5848
...
@@ -5,7 +5,6 @@ from colossalai.utils.cuda import get_current_device
...
@@ -5,7 +5,6 @@ from colossalai.utils.cuda import get_current_device
from
colossalai.zero.sharded_param
import
(
StatefulTensor
,
colo_tensor_mem_usage
,
colo_model_data_tensor_move
,
from
colossalai.zero.sharded_param
import
(
StatefulTensor
,
colo_tensor_mem_usage
,
colo_model_data_tensor_move
,
colo_model_data_tensor_move_inline
,
colo_model_data_move_to_cpu
,
colo_model_data_tensor_move_inline
,
colo_model_data_move_to_cpu
,
colo_model_tensor_clone
)
colo_model_tensor_clone
)
from
colossalai.utils.memory
import
colo_set_process_memory_fraction
,
colo_device_memory_capacity
from
colossalai.utils
import
free_port
from
colossalai.utils
import
free_port
import
torch
import
torch
...
@@ -32,13 +31,6 @@ def _run_colo_tensor_mem_usage():
...
@@ -32,13 +31,6 @@ def _run_colo_tensor_mem_usage():
assert
g1
*
4
==
g2
assert
g1
*
4
==
g2
def
_run_colo_set_process_memory_fraction_and_colo_device_memory_capacity
():
frac1
=
colo_device_memory_capacity
(
get_current_device
())
colo_set_process_memory_fraction
(
0.5
)
frac2
=
colo_device_memory_capacity
(
get_current_device
())
assert
frac2
*
2
==
frac1
def
_run_colo_model_data_tensor_move_inline
():
def
_run_colo_model_data_tensor_move_inline
():
for
t
in
[
StatefulTensor
(
torch
.
randn
(
2
,
3
)),
torch
.
randn
(
2
,
3
)]:
for
t
in
[
StatefulTensor
(
torch
.
randn
(
2
,
3
)),
torch
.
randn
(
2
,
3
)]:
colo_model_data_tensor_move_inline
(
t
,
get_current_device
())
colo_model_data_tensor_move_inline
(
t
,
get_current_device
())
...
@@ -82,20 +74,20 @@ def _run_colo_model_tensor_clone():
...
@@ -82,20 +74,20 @@ def _run_colo_model_tensor_clone():
def
run_dist
(
rank
,
world_size
,
port
):
def
run_dist
(
rank
,
world_size
,
port
):
colossalai
.
launch
(
config
=
{},
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
colossalai
.
launch
(
config
=
{},
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
_run_colo_set_process_memory_fraction_and_colo_device_memory_capacity
()
_run_colo_tensor_mem_usage
()
_run_colo_model_data_tensor_move_inline
()
_run_colo_model_data_tensor_move_inline
()
_run_colo_model_data_tensor_move
()
_run_colo_model_data_tensor_move
()
_run_colo_tensor_mem_usage
()
_run_colo_model_data_move_to_cpu
()
_run_colo_model_data_move_to_cpu
()
_run_colo_model_tensor_clone
()
_run_colo_model_tensor_clone
()
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
parametrize
(
"world_size"
,
[
4
,
5
])
@
pytest
.
mark
.
parametrize
(
"world_size"
,
[
4
,
5
])
def
test_tensor_
move
(
world_size
):
def
test_
zero_
tensor_
utils
(
world_size
):
run_func
=
partial
(
run_dist
,
world_size
=
world_size
,
port
=
free_port
())
run_func
=
partial
(
run_dist
,
world_size
=
world_size
,
port
=
free_port
())
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
test_tensor_
move
(
4
)
test_
zero_
tensor_
utils
(
world_size
=
2
)
tests/test_zero
_data_parallel
/test_zero_engine.py
→
tests/test_zero/test_zero_engine.py
View file @
53cb5848
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment