Unverified Commit 143289dc authored by Stas Bekman, committed by GitHub

[test_model_parallelization] multiple fixes (#9354)

parent 086718ac
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import copy
+import gc
 import inspect
 import os.path
 import random
@@ -1081,15 +1082,15 @@ class ModelTesterMixin:
         if not self.test_model_parallel:
             return
 
-        import subprocess
-
+        # a candidate for testing_utils
         def get_current_gpu_memory_use():
-            run_process = subprocess.Popen(
-                "nvidia-smi --query-gpu=memory.used --format=csv,nounits,noheader", shell=True, stdout=subprocess.PIPE
-            )
-            memory_usage = run_process.stdout.read().decode("utf-8").strip()
-            per_device_memory = [int(memory) for memory in memory_usage.split("\n")]
+            """ returns a list of cuda memory allocations per GPU in MBs"""
+
+            per_device_memory = []
+            for id in range(torch.cuda.device_count()):
+                with torch.cuda.device(id):
+                    per_device_memory.append(torch.cuda.memory_allocated() >> 20)
 
             return per_device_memory
 
         # Needs a large model to see the difference.
@@ -1098,39 +1099,44 @@ class ModelTesterMixin:
         for model_class in self.all_parallelizable_model_classes:
             torch.cuda.empty_cache()
 
-            # Retrieve initial memory usage (should be close to 0)
-            initial_memory = get_current_gpu_memory_use()
+            # 1. single gpu memory load + unload + memory measurements
+            # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests)
+            memory_at_start = get_current_gpu_memory_use()
 
-            # Put model on device
-            model = model_class(config.from_pretrained("gpt2"))
+            # Put model on device 0 and take a memory snapshot
+            model = model_class(config)
             model.to("cuda:0")
-            # Retrieve the memory after the model is put on the device
             memory_after_model_load = get_current_gpu_memory_use()
 
+            # The memory use on device 0 should be higher than it was initially.
+            self.assertGreater(memory_after_model_load[0], memory_at_start[0])
+
             del model
+            gc.collect()
             torch.cuda.empty_cache()
 
-            # The memory use on that device should be higher than it was initially.
-            self.assertGreater(memory_after_model_load[0], initial_memory[0])
+            # 2. MP test
+            # it's essential to re-calibrate the usage before the next stage
+            memory_at_start = get_current_gpu_memory_use()
 
             # Spread model layers over multiple devices
-            model = model_class(config.from_pretrained("gpt2"))
+            model = model_class(config)
             model.parallelize()
-
             memory_after_parallelization = get_current_gpu_memory_use()
 
             # Assert that the memory use on all devices is higher than it was when loaded only on CPU
             for n in range(torch.cuda.device_count()):
-                self.assertGreater(memory_after_parallelization[n], initial_memory[n])
+                self.assertGreater(memory_after_parallelization[n], memory_at_start[n])
 
-            # Assert that the memory use of the first device is lower than it was when the entire model was loaded on it
+            # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it
             self.assertLess(memory_after_parallelization[0], memory_after_model_load[0])
 
-            # Assert that the memory use of the second device is higher than it was when the entire model was loaded
-            # on the other device.
+            # Assert that the memory use of device 1 is higher than it was when the entire model was loaded
+            # on device 0 and device 1 wasn't used at all
             self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1])
 
             del model
+            gc.collect()
             torch.cuda.empty_cache()
 
     @require_torch_multi_gpu
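For readers who want to exercise the measurement logic outside the test suite, below is a minimal standalone sketch of the same approach (illustrative only, not part of this commit): it mirrors the per-device helper from the diff and walks through the allocate / release / re-measure cycle that motivates the added gc.collect() calls. It assumes PyTorch is installed and at least one CUDA GPU is visible; the __main__ section and its tensor sizes are made up for the example.

# Standalone sketch (not part of the commit): per-device CUDA memory bookkeeping
# in the style of the updated test. Assumes at least one visible CUDA GPU.
import gc

import torch


def get_current_gpu_memory_use():
    """Return a list of CUDA memory allocations per GPU, in MBs."""
    per_device_memory = []
    for device_id in range(torch.cuda.device_count()):
        with torch.cuda.device(device_id):
            # memory_allocated() counts only tensors held by this process's
            # caching allocator, so other processes sharing the GPU don't skew it.
            per_device_memory.append(torch.cuda.memory_allocated() >> 20)
    return per_device_memory


if __name__ == "__main__":
    memory_at_start = get_current_gpu_memory_use()

    # Allocate roughly 400MB of fp32 on device 0 and re-measure.
    x = torch.zeros(100 * 2**20, device="cuda:0")
    print("after allocation:", get_current_gpu_memory_use())

    # Drop the reference so the allocation can be freed; gc.collect() also catches
    # reference cycles (presumably why the commit adds it before empty_cache()),
    # and empty_cache() hands cached blocks back to the driver.
    del x
    gc.collect()
    torch.cuda.empty_cache()
    print("after release:", get_current_gpu_memory_use(), "baseline was:", memory_at_start)

As the new docstring notes, the helper reports values in MBs; the >> 20 shift converts the byte count returned by torch.cuda.memory_allocated() to mebibytes. Unlike parsing nvidia-smi output, this measures only the current process's tensor allocations, which is what makes the re-calibration between the single-GPU stage and the model-parallel stage meaningful.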