Unverified Commit 2fc33ebe authored by Zach Mueller, committed by GitHub

Track the number of tokens seen in metrics (#27274)



* Add tokens seen

* Address comments, add to TrainingArgs

* Update log

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Use self.args

* Fix docstring
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 303c1d69
@@ -1838,6 +1838,17 @@ class Trainer:
step = -1
for step, inputs in enumerate(epoch_iterator):
    total_batched_samples += 1

    if self.args.include_num_input_tokens_seen:
        main_input_name = getattr(self.model, "main_input_name", "input_ids")
        if main_input_name not in inputs:
            logger.warning(
                "Tried to track the number of tokens seen, however the current model is "
                "not configured properly to know what item is the input. To fix this, add "
                "a `main_input_name` attribute to the model class you are using."
            )
        else:
            self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()
    if rng_to_sync:
        self._load_rng_state(resume_from_checkpoint)
        rng_to_sync = False
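
As a side note, here is a minimal single-process sketch of what the accounting above computes per batch (with one process, the `accelerator.gather` call is effectively a no-op); the batch contents and shapes are hypothetical:

```python
import torch

# Hypothetical batch, shaped like an item from `epoch_iterator`:
# 8 sequences of 512 token ids under the model's main input name.
inputs = {"input_ids": torch.randint(0, 100, (8, 512))}

main_input_name = "input_ids"  # the fallback used when the model defines no `main_input_name`
num_input_tokens_seen = 0
if main_input_name in inputs:
    # numel() counts every element of the tensor, so padding tokens are
    # included: this tracks raw input size, not the non-padded token count.
    num_input_tokens_seen += inputs[main_input_name].numel()

print(num_input_tokens_seen)  # 8 * 512 = 4096
```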
@@ -2640,6 +2651,8 @@ class Trainer:
"""
if self.state.epoch is not None:
logs["epoch"] = round(self.state.epoch, 2)
if self.args.include_num_input_tokens_seen:
logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen
output = {**logs, **{"step": self.state.global_step}}
self.state.log_history.append(output)
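
For illustration, with the flag enabled, an entry appended to `log_history` would then look roughly like this (all values hypothetical):

```python
# Hypothetical entry appended to trainer.state.log_history:
{"loss": 2.31, "epoch": 0.25, "num_input_tokens_seen": 1048576, "step": 500}
```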
@@ -59,6 +59,8 @@ class TrainerState:
    Run an evaluation every X steps.
save_steps (`int`, *optional*, defaults to 500):
    Save a checkpoint every X update steps.
num_input_tokens_seen (`int`, *optional*, defaults to 0):
    The number of tokens seen during training (number of input tokens, not the number of prediction tokens).
total_flos (`float`, *optional*, defaults to 0):
    The total number of floating operations done by the model since the beginning of training (stored as floats
    to avoid overflow).
@@ -87,6 +89,7 @@ class TrainerState:
eval_steps: int = 500
save_steps: int = 500
num_train_epochs: int = 0
num_input_tokens_seen: int = 0
total_flos: float = 0
log_history: List[Dict[str, float]] = None
best_metric: Optional[float] = None
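
A short usage sketch, assuming an already-configured `Trainer` instance named `trainer`: the counter lives on `TrainerState` and stays at its default of 0 when tracking is disabled.

```python
trainer.train()
# Cumulative number of input tokens gathered across all processes; remains 0
# unless `include_num_input_tokens_seen=True` was set in TrainingArguments.
print(trainer.state.num_input_tokens_seen)
```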
@@ -637,6 +637,12 @@ class TrainingArguments:
    This will iterate over the entire training dataloader once beforehand,
    and will slow down the entire process.
include_num_input_tokens_seen (`bool`, *optional*):
    Whether or not to track the number of input tokens seen throughout training.
    May be slower in distributed training as gather operations must be called.
neftune_noise_alpha (`Optional[float]`):
    If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance
    for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the
@@ -1258,6 +1264,13 @@ class TrainingArguments:
metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
)
include_num_input_tokens_seen: Optional[bool] = field(
default=False,
metadata={
"help": "If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training)"
},
)
neftune_noise_alpha: float = field(
default=None,
metadata={
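
Finally, a hedged sketch of opting in from the arguments side; `output_dir` is a placeholder and the model/dataset wiring is omitted:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",  # placeholder path
    include_num_input_tokens_seen=True,  # opt in; adds a per-step gather in distributed runs
)
# Pass `args` to a Trainer as usual; `num_input_tokens_seen` then shows up in its logs.
```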