Unverified commit 41b0564b authored by Stas Bekman, committed by GitHub

[bnb optim] fixing test (#21030)

* [bnb optim] fixing test

* force 1 gpu

* fix

* fix

* fix

* finalize

* improve commentary

* fix

* cleanup

* more fixes
parent 212829ad
@@ -1044,10 +1044,14 @@ class Trainer:
                 manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
 
+                skipped = 0
                 for module in opt_model.modules():
                     if isinstance(module, nn.Embedding):
+                        skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+                        print(f"skipped {module}: {skipped/2**20}M params")
                         manager.register_module_override(module, "weight", {"optim_bits": 32})
                         logger.debug(f"bitsandbytes: will optimize {module} in fp32")
+                print(f"skipped: {skipped/2**20}M params")
 
         if is_sagemaker_mp_enabled():
             self.optimizer = smp.DistributedOptimizer(self.optimizer)
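The trainer change above registers every `nn.Embedding` with bitsandbytes' `GlobalOptimManager` so that its optimizer state stays in 32-bit while the rest of the model uses 8-bit Adam states, and it counts how many parameters were skipped. Below is a minimal standalone sketch of the same pattern, assuming `bitsandbytes` is installed; the toy `nn.Sequential` model and the learning rate are made up for illustration:

```python
import torch.nn as nn
import bitsandbytes  # assumption: bitsandbytes is installed; a CUDA GPU is needed for the actual 8-bit step

# toy model: the embedding's optimizer state should stay in 32-bit, the linear layer's can be 8-bit
model = nn.Sequential(nn.Embedding(1000, 64), nn.Linear(64, 64))

manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
skipped = 0
for module in model.modules():
    if isinstance(module, nn.Embedding):
        # count unique parameter storages (keyed by data_ptr) that are kept out of 8-bit quantization
        skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
        manager.register_module_override(module, "weight", {"optim_bits": 32})
print(f"skipped: {skipped / 2**20:.3f}M params")

# the override takes effect when the 8-bit optimizer later initializes its state buffers
optimizer = bitsandbytes.optim.Adam8bit(model.parameters(), lr=1e-3)
```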
@@ -17,11 +17,11 @@ import os
 import re
 import sys
 import unittest
+from pathlib import Path
 from typing import Tuple
 from unittest.mock import patch
 
 from parameterized import parameterized
 
-from transformers import AutoModel
 from transformers.testing_utils import (
     CaptureStderr,
     ExtendSysPath,
@@ -207,96 +207,97 @@ class TestTrainerExt(TestCasePlus):
         from transformers.training_args import OptimizerNames
 
         def train_and_return_metrics(optim: str) -> Tuple[int, float]:
-            from pathlib import Path
-
-            extra_args = (
-                f"--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict "
-                "False --adafactor False --log_level debug"
-            )
+            extra_args = "--skip_memory_metrics 0"
 
             output_dir = self.run_trainer(
-                eval_steps=2,
                 max_len=128,
                 model_name=MARIAN_MODEL,
                 learning_rate=3e-4,
                 num_train_epochs=1,
+                optim=optim,
                 distributed=True,  # force run in a new process
                 extra_args_str=extra_args,
                 do_eval=False,
                 do_predict=False,
+                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
             )
 
             # Check metrics
             logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
-            gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"]
-            gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"]
+            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
+            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)
 
             loss = logs[0]["train_loss"]
-            return gpu_peak_mem, gpu_alloc_mem, loss
+            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss
 
         gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
         gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)
 
-        gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb
-        gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb
+        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb
 
         gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
         gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
+        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb
 
-        gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb
-        gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb
-
-        # leave this for now if CI gets very different results
-        # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" )
-        # print(f" {gpu_alloc_mem_bnb=:010d} {gpu_peak_mem_bnb=:010d} {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}")
-        # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}")
-        # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}")
-        # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}")
+        # sshleifer/student_marian_en_ro_6_1 has 54M parameter, 29M of which is `nn.Embedding` which
+        # doesn't get quantized and remains in fp32. Therefore we only have 25M parameters quantized
+        # in 2 bytes and the diff in optim memory usage is derived as so:
+        #
+        # - normal 25*8=~200MB (8 bytes per param)
+        # - bnb    25*2= ~50MB (2 bytes per param)
+        #
+        # Thus we should expect ~150MB total memory saved.
+        #
+        # Peak memory should be the same - the total should be different by about that same margin
+        #
+        # After leaving a small margin to accommodate for differences between gpus let's check
+        # that we have at least 120MB in savings
+        expected_savings = 120
+
+        # uncomment the following if this test starts failing - requires py38 for a new print feature
+        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
+        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
+        # print(f" {gpu_alloc_mem_bnb=}MB {gpu_peak_mem_bnb=}MB {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
+        # print(f"{gpu_alloc_mem_diff=}MB")
+        # print(f"{gpu_peak_mem_diff=}MB")
+        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
+        # print(f"{gpu_total_mem_diff=}MB, {gpu_total_mem_diff=}MB")
 
         self.assertGreater(
-            gpu_peak_mem_diff_percent,
-            10,  # basically a huge difference - got ~30x on my desktop
-            "should use very little peak gpu memory with BNB, compared to without it"
-            f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}",
+            gpu_alloc_mem_diff,
+            expected_savings,
+            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
+            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
+            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
         )
 
         self.assertGreater(
-            gpu_total_mem_diff_percent,
-            0.20,  # could easily be 0.50, but let's stay on the safe side
-            "Using BNB should use less total GPU memory than without it"
-            f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}",
+            gpu_total_mem_diff,
+            expected_savings,
+            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
+            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
+            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
         )
 
         self.assertEqual(
             loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
         )
 
-        # Additionally let's test that the absolute gpu memory difference is larger or about the
-        # same as the expected saving coming from BNB (6 bytes per param)
-        model = AutoModel.from_pretrained(MARIAN_MODEL)
-        total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
-        bnb_saved_bytes = total_numel * 6  # 324MB
-
-        self.assertGreater(
-            gpu_total_mem_diff_bytes,
-            bnb_saved_bytes * 0.8,  # add a safety margin, if it saved slightly less
-            f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were"
-            f" {gpu_total_mem_diff_bytes}",
-        )
-
     def run_trainer(
         self,
-        eval_steps: int,
         max_len: int,
         model_name: str,
         num_train_epochs: int,
         learning_rate: float = 3e-3,
+        optim: str = "adafactor",
         distributed: bool = False,
         extra_args_str: str = None,
+        eval_steps: int = 0,
         predict_with_generate: bool = True,
         do_train: bool = True,
         do_eval: bool = True,
         do_predict: bool = True,
+        n_gpus_to_use: int = None,
     ):
         data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
         output_dir = self.get_auto_remove_tmp_dir()
@@ -320,10 +321,9 @@ class TestTrainerExt(TestCasePlus):
             --save_steps {str(eval_steps)}
             --group_by_length
             --label_smoothing_factor 0.1
-            --adafactor
             --target_lang ro_RO
             --source_lang en_XX
-        """
+        """.split()
 
         args_eval = f"""
             --do_eval
@@ -332,13 +332,13 @@ class TestTrainerExt(TestCasePlus):
             --val_max_target_length 22,426
             --evaluation_strategy steps
             --eval_steps {str(eval_steps)}
-        """
+        """.split()
 
         args_predict = """
             --do_predict
-        """
+        """.split()
 
-        args = ""
+        args = []
 
         if do_train:
             args += args_train
@@ -349,19 +349,25 @@ class TestTrainerExt(TestCasePlus):
             args += args_predict
 
         if predict_with_generate:
-            args += "--predict_with_generate"
+            args += "--predict_with_generate".split()
 
-        args = args.split()
+        if do_train:
+            if optim == "adafactor":
+                args += "--adafactor".split()
+            else:
+                args += f"--optim {optim}".split()
 
         if extra_args_str is not None:
-            args.extend(extra_args_str.split())
+            args += extra_args_str.split()
 
         if distributed:
-            n_gpu = get_gpu_count()
+            if n_gpus_to_use is None:
+                n_gpus_to_use = get_gpu_count()
             master_port = get_torch_dist_unique_port()
             distributed_args = f"""
                 -m torch.distributed.launch
-                --nproc_per_node={n_gpu}
+                --nproc_per_node={n_gpus_to_use}
                 --master_port={master_port}
                 {self.examples_dir_str}/pytorch/translation/run_translation.py
             """.split()