Unverified commit 34a64064, authored by Zach Mueller, committed by GitHub

Save TB logs as part of push_to_hub (#27022)

* Support runs/

* Upload runs folder as part of push to hub

* Add a test

* Add to test deps

* Update with proposed solution from Slack

* Ensure that repo gets deleted in tests
parent 18925925
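What this changes: `Trainer` previously passed `ignore_patterns=["_*", "**/*"]` when uploading `output_dir` to the Hub, which skipped every subfolder, including the `runs/` directory where the TensorBoard callback writes its event files. The diff below relaxes that to only skip `checkpoint-*` folders, so TensorBoard logs now reach the Hub. A minimal sketch of the user-facing setup (the output dir name is a placeholder, not taken from this diff, and pushing assumes you are logged in to the Hub):

from transformers import TrainingArguments

# Placeholder configuration; "my-model" is a hypothetical output dir.
args = TrainingArguments(
    output_dir="my-model",
    report_to=["tensorboard"],  # event files land in my-model/runs/
    push_to_hub=True,
)
# After trainer.train(), trainer.push_to_hub() uploads output_dir; with this
# commit the runs/ folder is included, while checkpoint-* folders are still skipped.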
setup.py:

@@ -167,6 +167,7 @@ _deps = [
     "starlette",
     "sudachipy>=0.6.6",
     "sudachidict_core>=20220729",
+    "tensorboard",
     # TensorFlow pin. When changing this value, update examples/tensorflow/_tests_requirements.txt accordingly
     "tensorflow-cpu>=2.6,<2.15",
     "tensorflow>=2.6,<2.15",

@@ -319,6 +320,7 @@ extras["testing"] = (
         "sacremoses",
         "rjieba",
         "beautifulsoup4",
+        "tensorboard",
     )
     + extras["retrieval"]
     + extras["modelcreation"]
src/transformers/dependency_versions_table.py:

@@ -73,6 +73,7 @@ deps = {
     "starlette": "starlette",
     "sudachipy": "sudachipy>=0.6.6",
     "sudachidict_core": "sudachidict_core>=20220729",
+    "tensorboard": "tensorboard",
    "tensorflow-cpu": "tensorflow-cpu>=2.6,<2.15",
     "tensorflow": "tensorflow>=2.6,<2.15",
     "tensorflow-text": "tensorflow-text<2.15",
src/transformers/testing_utils.py:

@@ -45,6 +45,7 @@ from .integrations import (
     is_optuna_available,
     is_ray_available,
     is_sigopt_available,
+    is_tensorboard_available,
     is_wandb_available,
 )
 from .integrations.deepspeed import is_deepspeed_available

@@ -911,6 +912,13 @@ def require_optimum(test_case):
     return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case)


+def require_tensorboard(test_case):
+    """
+    Decorator for `tensorboard` dependency
+    """
+    return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard")(test_case)
+
+
 def require_auto_gptq(test_case):
     """
     Decorator for auto_gptq dependency
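The new helper mirrors the other `require_*` decorators in `testing_utils.py`: it skips a test when tensorboard is not installed. A hypothetical test using it (not part of this diff) would look like:

import unittest

from transformers.testing_utils import require_tensorboard

class ExampleTensorboardTest(unittest.TestCase):
    @require_tensorboard
    def test_writes_events(self):
        # Runs only when tensorboard is importable; skipped otherwise.
        from torch.utils.tensorboard import SummaryWriter

        self.assertIsNotNone(SummaryWriter)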
src/transformers/trainer.py:

@@ -3560,7 +3560,7 @@ class Trainer:
             commit_message=commit_message,
             token=self.args.hub_token,
             run_as_future=True,
-            ignore_patterns=["_*", "**/*"],
+            ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
         )

         push_jobs = [model_push_job]

@@ -3630,14 +3630,13 @@ class Trainer:
         # Wait for the current upload to be finished.
         self._finish_current_push()
         return upload_folder(
             repo_id=self.hub_model_id,
             folder_path=self.args.output_dir,
             commit_message=commit_message,
             token=self.args.hub_token,
             run_as_future=not blocking,
-            ignore_patterns=["_*", "**/*"],
+            ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
         )

         #
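The behavioral change is entirely in `ignore_patterns`. `huggingface_hub` filters upload paths with fnmatch-style patterns, so the old `**/*` pattern excluded every file inside any subfolder, `runs/` included; the new pattern only excludes `checkpoint-*` folders (`PREFIX_CHECKPOINT_DIR` is `"checkpoint"`). A minimal stdlib sketch of the difference, with hypothetical file paths:

from fnmatch import fnmatch

# Hypothetical repo-relative paths as upload_folder would see them.
paths = [
    "pytorch_model.bin",
    "runs/Oct25_12-00-00_host/events.out.tfevents.1698230000.host",
    "checkpoint-500/optimizer.pt",
]

old_patterns = ["_*", "**/*"]          # before: anything under a subfolder is ignored
new_patterns = ["_*", "checkpoint-*"]  # after: only checkpoint folders are ignored

def is_ignored(path, patterns):
    # fnmatch's `*` also matches `/`, so "checkpoint-*" covers whole folders.
    return any(fnmatch(path, pattern) for pattern in patterns)

for path in paths:
    print(f"{path}: old={is_ignored(path, old_patterns)} new={is_ignored(path, new_patterns)}")
# runs/... is ignored under the old patterns but uploaded under the new ones;
# checkpoint-500/... stays ignored under both pattern sets.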
tests/trainer/test_trainer.py:

@@ -30,7 +30,7 @@ from typing import Dict, List
 from unittest.mock import Mock, patch

 import numpy as np
-from huggingface_hub import HfFolder, delete_repo, list_repo_commits
+from huggingface_hub import HfFolder, delete_repo, list_repo_commits, list_repo_files
 from parameterized import parameterized
 from requests.exceptions import HTTPError

@@ -60,6 +60,7 @@ from transformers.testing_utils import (
     require_safetensors,
     require_sentencepiece,
     require_sigopt,
+    require_tensorboard,
     require_tokenizers,
     require_torch,
     require_torch_bf16_cpu,

@@ -138,10 +139,13 @@ class RegressionDataset:
 class RegressionTrainingArguments(TrainingArguments):
     a: float = 0.0
     b: float = 0.0
+    keep_report_to: bool = False

     def __post_init__(self):
         super().__post_init__()
-        # save resources not dealing with reporting (also avoids the warning when it's not set)
-        self.report_to = []
+        # save resources not dealing with reporting unless specified (also avoids the warning when it's not set)
+        # can be explicitly disabled via `keep_report_to`
+        if not self.keep_report_to:
+            self.report_to = []

@@ -319,7 +323,9 @@ if is_torch_available():
             h = nn.functional.relu(self.linear2(x))
             return self.ln2(x + h + self.bias)

-    def get_regression_trainer(a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, **kwargs):
+    def get_regression_trainer(
+        a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, keep_report_to=False, **kwargs
+    ):
         label_names = kwargs.get("label_names", None)
         train_dataset = RegressionDataset(length=train_len, label_names=label_names)
         eval_dataset = RegressionDataset(length=eval_len, label_names=label_names)

@@ -340,7 +346,7 @@ if is_torch_available():
         output_dir = kwargs.pop("output_dir", "./regression")
         preprocess_logits_for_metrics = kwargs.pop("preprocess_logits_for_metrics", None)
-        args = RegressionTrainingArguments(output_dir, a=a, b=b, **kwargs)
+        args = RegressionTrainingArguments(output_dir, a=a, b=b, keep_report_to=keep_report_to, **kwargs)
         return Trainer(
             model,
             args,

@@ -2155,7 +2161,7 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
     @classmethod
     def tearDownClass(cls):
-        for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step"]:
+        for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step", "test-trainer-tensorboard"]:
             try:
                 delete_repo(token=cls._token, repo_id=model)
             except HTTPError:

@@ -2264,6 +2270,28 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
         for i in range(5, max_steps, 5):
             self.assertIn(f"Training in progress, step {i}", commits)

+    @require_tensorboard
+    def test_push_to_hub_with_tensorboard_logs(self):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = get_regression_trainer(
+                output_dir=os.path.join(tmp_dir, "test-trainer-tensorboard"),
+                hub_token=self._token,
+                save_strategy="epoch",
+                report_to=["tensorboard"],
+                keep_report_to=True,
+            )
+            trainer.train()
+            # Push the runs via `push_to_hub()`
+            trainer.push_to_hub()
+
+            files = list_repo_files(f"{USER}/test-trainer-tensorboard", token=self._token)
+            found_log = False
+            for f in files:
+                if len(f.split("runs")) > 1 and "events.out.tfevents" in f:
+                    found_log = True
+
+            assert found_log is True, "No tensorboard log found in repo"
+
     @require_torch
     @require_optuna
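The test's final loop amounts to asking whether any repo file lives under `runs/` and looks like a TensorBoard event file. The same check against an arbitrary existing repo (the repo id is a placeholder, not from this diff):

from huggingface_hub import list_repo_files

files = list_repo_files("user/my-model")  # placeholder repo id
has_tb_logs = any("runs" in f and "events.out.tfevents" in f for f in files)
print("TensorBoard logs on the Hub:", has_tb_logs)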