"...lm-evaluation-harness.git" did not exist on "0d1ef03742d4986dfb85670971561760dda5f25a"
Unverified Commit 41b0564b authored by Stas Bekman, committed by GitHub

[bnb optim] fixing test (#21030)

* [bnb optim] fixing test

* force 1 gpu

* fix

* fix

* fix

* finalize

* improve commentary

* fix

* cleanup

* more fixes
parent 212829ad
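
For orientation: the test fixed below exercises the bitsandbytes 8-bit Adam path of the Trainer. In user terms, the setup under test looks roughly like the following sketch (not part of this commit; it only assumes transformers and bitsandbytes are installed):

    from transformers import TrainingArguments
    from transformers.training_args import OptimizerNames

    # select bitsandbytes 8-bit Adam and keep the GPU memory metrics the test reads
    training_args = TrainingArguments(
        output_dir="out",
        optim=OptimizerNames.ADAMW_BNB.value,
        skip_memory_metrics=False,
    )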
@@ -1044,10 +1044,14 @@ class Trainer:
                 manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
 
+                skipped = 0
                 for module in opt_model.modules():
                     if isinstance(module, nn.Embedding):
+                        skipped += sum(dict((p.data_ptr(), p.numel()) for p in module.parameters()).values())
+                        print(f"skipped {module}: {skipped/2**20}M params")
                         manager.register_module_override(module, "weight", {"optim_bits": 32})
                         logger.debug(f"bitsandbytes: will optimize {module} in fp32")
+                print(f"skipped: {skipped/2**20}M params")
 
         if is_sagemaker_mp_enabled():
             self.optimizer = smp.DistributedOptimizer(self.optimizer)
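
The loop above asks bitsandbytes' GlobalOptimManager to keep nn.Embedding weights in 32-bit optimizer state while the rest of the model gets 8-bit Adam states; the new skipped counter just reports how many parameters were exempted. A minimal standalone sketch of the same override pattern (not part of the change; assumes bitsandbytes is installed and a CUDA device is available if you actually step the optimizer):

    import torch.nn as nn
    import bitsandbytes

    model = nn.Sequential(nn.Embedding(1000, 64), nn.Linear(64, 2))

    # register embedding weights so their optimizer state stays in 32-bit
    manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
    for module in model.modules():
        if isinstance(module, nn.Embedding):
            manager.register_module_override(module, "weight", {"optim_bits": 32})

    # every other parameter gets 8-bit optimizer state
    optimizer = bitsandbytes.optim.Adam8bit(model.parameters(), lr=1e-3)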
@@ -17,11 +17,11 @@ import os
 import re
 import sys
 import unittest
+from pathlib import Path
 from typing import Tuple
 from unittest.mock import patch
 
 from parameterized import parameterized
 
-from transformers import AutoModel
 from transformers.testing_utils import (
     CaptureStderr,
     ExtendSysPath,
@@ -207,96 +207,97 @@ class TestTrainerExt(TestCasePlus):
         from transformers.training_args import OptimizerNames
 
         def train_and_return_metrics(optim: str) -> Tuple[int, float]:
-            from pathlib import Path
-
-            extra_args = (
-                f"--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict "
-                "False --adafactor False --log_level debug"
-            )
+            extra_args = "--skip_memory_metrics 0"
 
             output_dir = self.run_trainer(
-                eval_steps=2,
                 max_len=128,
                 model_name=MARIAN_MODEL,
                 learning_rate=3e-4,
                 num_train_epochs=1,
+                optim=optim,
                 distributed=True,  # force run in a new process
                 extra_args_str=extra_args,
                 do_eval=False,
                 do_predict=False,
+                n_gpus_to_use=1,  # to allow deterministic fixed memory usage
             )
 
             # Check metrics
             logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history
-            gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"]
-            gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"]
+            gpu_peak_mem_mb = int(logs[0]["train_mem_gpu_peaked_delta"] / 2**20)
+            gpu_alloc_mem_mb = int(logs[0]["train_mem_gpu_alloc_delta"] / 2**20)
 
             loss = logs[0]["train_loss"]
-            return gpu_peak_mem, gpu_alloc_mem, loss
+            return gpu_peak_mem_mb, gpu_alloc_mem_mb, loss
         gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value)
         gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value)
 
-        gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb
-        gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb
+        gpu_alloc_mem_diff = gpu_alloc_mem_orig - gpu_alloc_mem_bnb
 
         gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig
         gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb
-        gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb
-        gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb
+        gpu_total_mem_diff = gpu_total_mem_orig - gpu_total_mem_bnb
 
-        # leave this for now if CI gets very different results
-        # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" )
-        # print(f" {gpu_alloc_mem_bnb=:010d} {gpu_peak_mem_bnb=:010d} {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}")
-        # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}")
-        # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}")
-        # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}")
+        # sshleifer/student_marian_en_ro_6_1 has 54M parameter, 29M of which is `nn.Embedding` which
+        # doesn't get quantized and remains in fp32. Therefore we only have 25M parameters quantized
+        # in 2 bytes and the diff in optim memory usage is derived as so:
+        #
+        # - normal 25*8=~200MB (8 bytes per param)
+        # - bnb    25*2= ~50MB (2 bytes per param)
+        #
+        # Thus we should expect ~150MB total memory saved.
+        #
+        # Peak memory should be the same - the total should be different by about that same margin
+        #
+        # After leaving a small margin to accommodate for differences between gpus let's check
+        # that we have at least 120MB in savings
+        expected_savings = 120
+
+        # uncomment the following if this test starts failing - requires py38 for a new print feature
+        # gpu_peak_mem_diff = gpu_peak_mem_orig - gpu_peak_mem_bnb
+        # print(f"{gpu_alloc_mem_orig=}MB {gpu_peak_mem_orig=}MB {gpu_alloc_mem_orig+gpu_peak_mem_orig=}MB")
+        # print(f" {gpu_alloc_mem_bnb=}MB {gpu_peak_mem_bnb=}MB {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=}MB")
+        # print(f"{gpu_alloc_mem_diff=}MB")
+        # print(f"{gpu_peak_mem_diff=}MB")
+        # print(f"{gpu_total_mem_orig=}MB, {gpu_total_mem_bnb=}MB")
+        # print(f"{gpu_total_mem_diff=}MB, {gpu_total_mem_diff=}MB")
         self.assertGreater(
-            gpu_peak_mem_diff_percent,
-            10,  # basically a huge difference - got ~30x on my desktop
-            "should use very little peak gpu memory with BNB, compared to without it"
-            f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}",
+            gpu_alloc_mem_diff,
+            expected_savings,
+            "should use ~150MB less alloc gpu memory with BNB, compared to without it for this model but got"
+            f" a difference of {gpu_alloc_mem_diff}MB, with gpu_alloc_mem_orig={gpu_alloc_mem_orig}MB and"
+            f" gpu_alloc_mem_bnb={gpu_alloc_mem_bnb}MB",
         )
 
         self.assertGreater(
-            gpu_total_mem_diff_percent,
-            0.20,  # could easily be 0.50, but let's stay on the safe side
-            "Using BNB should use less total GPU memory than without it"
-            f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}",
+            gpu_total_mem_diff,
+            expected_savings,
+            "should use ~150MB less total gpu memory with BNB, compared to without it for this model but got"
+            f" a difference of {gpu_total_mem_diff}MB, with gpu_total_mem_orig={gpu_total_mem_orig}MB and"
+            f" gpu_total_mem_bnb={gpu_total_mem_bnb}MB",
         )
 
         self.assertEqual(
             loss_orig, loss_bnb, f"loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}"
         )
-
-        # Additionally let's test that the absolute gpu memory difference is larger or about the
-        # same as the expected saving coming from BNB (6 bytes per param)
-        model = AutoModel.from_pretrained(MARIAN_MODEL)
-        total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
-        bnb_saved_bytes = total_numel * 6  # 324MB
-
-        self.assertGreater(
-            gpu_total_mem_diff_bytes,
-            bnb_saved_bytes * 0.8,  # add a safety margin, if it saved slightly less
-            f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were"
-            f" {gpu_total_mem_diff_bytes}",
-        )
     def run_trainer(
         self,
-        eval_steps: int,
         max_len: int,
         model_name: str,
         num_train_epochs: int,
         learning_rate: float = 3e-3,
+        optim: str = "adafactor",
         distributed: bool = False,
         extra_args_str: str = None,
+        eval_steps: int = 0,
         predict_with_generate: bool = True,
         do_train: bool = True,
         do_eval: bool = True,
         do_predict: bool = True,
+        n_gpus_to_use: int = None,
     ):
         data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
         output_dir = self.get_auto_remove_tmp_dir()
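
The expected-savings arithmetic in the comment above (roughly 25M quantizable parameters times 6 bytes of optimizer state saved, i.e. ~150MB) can be reproduced outside the test. A hedged sketch, assuming MARIAN_MODEL points at the sshleifer/student_marian_en_ro_6_1 checkpoint named in that comment:

    import torch.nn as nn
    from transformers import AutoModel

    model = AutoModel.from_pretrained("sshleifer/student_marian_en_ro_6_1")

    # de-duplicate shared tensors by data_ptr, as the test does
    all_params = {p.data_ptr(): p.numel() for p in model.parameters()}
    emb_params = {
        p.data_ptr(): p.numel()
        for m in model.modules()
        if isinstance(m, nn.Embedding)
        for p in m.parameters()
    }
    quantized = sum(all_params.values()) - sum(emb_params.values())  # ~25M params

    # regular Adam keeps ~8 bytes of state per param, 8-bit Adam ~2 bytes
    saved_mb = quantized * (8 - 2) / 2**20
    print(f"expected optimizer-state saving: ~{saved_mb:.0f}MB")  # ~150MB here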
@@ -320,10 +321,9 @@ class TestTrainerExt(TestCasePlus):
             --save_steps {str(eval_steps)}
             --group_by_length
             --label_smoothing_factor 0.1
-            --adafactor
             --target_lang ro_RO
             --source_lang en_XX
-        """
+        """.split()
 
         args_eval = f"""
             --do_eval
@@ -332,13 +332,13 @@ class TestTrainerExt(TestCasePlus):
             --val_max_target_length 17,004
             --evaluation_strategy steps
             --eval_steps {str(eval_steps)}
-        """
+        """.split()
 
         args_predict = """
             --do_predict
-        """
+        """.split()
 
-        args = ""
+        args = []
         if do_train:
             args += args_train
@@ -349,19 +349,25 @@ class TestTrainerExt(TestCasePlus):
             args += args_predict
         if predict_with_generate:
-            args += "--predict_with_generate"
+            args += "--predict_with_generate".split()
 
-        args = args.split()
+        if do_train:
+            if optim == "adafactor":
+                args += "--adafactor".split()
+            else:
+                args += f"--optim {optim}".split()
 
         if extra_args_str is not None:
-            args.extend(extra_args_str.split())
+            args += extra_args_str.split()
 
         if distributed:
-            n_gpu = get_gpu_count()
+            if n_gpus_to_use is None:
+                n_gpus_to_use = get_gpu_count()
             master_port = get_torch_dist_unique_port()
             distributed_args = f"""
                 -m torch.distributed.launch
-                --nproc_per_node={n_gpu}
+                --nproc_per_node={n_gpus_to_use}
                 --master_port={master_port}
                 {self.examples_dir_str}/pytorch/translation/run_translation.py
             """.split()