"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "7d683f7baeed1f00e03555d58d32a6213263ef5f"
Unverified Commit 2fc33ebe authored by Zach Mueller, committed by GitHub

Track the number of tokens seen to metrics (#27274)



* Add tokens seen

* Address comments, add to TrainingArgs

* Update log

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Use self.args

* Fix docstring
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 303c1d69
@@ -1838,6 +1838,17 @@ class Trainer:
            step = -1
            for step, inputs in enumerate(epoch_iterator):
                total_batched_samples += 1

                if self.args.include_num_input_tokens_seen:
                    main_input_name = getattr(self.model, "main_input_name", "input_ids")
                    if main_input_name not in inputs:
                        logger.warning(
                            "Tried to track the number of tokens seen, however the current model is "
                            "not configured properly to know what item is the input. To fix this, add "
                            "a `main_input_name` attribute to the model class you are using."
                        )
                    else:
                        self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()
                if rng_to_sync:
                    self._load_rng_state(resume_from_checkpoint)
                    rng_to_sync = False
@@ -2640,6 +2651,8 @@ class Trainer:
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)
        if self.args.include_num_input_tokens_seen:
            logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen
        output = {**logs, **{"step": self.state.global_step}}
        self.state.log_history.append(output)
...
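Stripped of the surrounding loop, the new counting logic reduces to a few lines: count every element of the batch's main input tensor, gathered across processes. A minimal sketch of the same pattern, assuming a dict batch with an `input_ids` tensor (the documented fallback for `main_input_name`); this illustrates the idea rather than reproducing the Trainer internals verbatim:

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()
num_input_tokens_seen = 0

# Hypothetical batch, standing in for one `inputs` dict from `epoch_iterator`.
inputs = {"input_ids": torch.randint(0, 50_000, (8, 512))}

# Models expose `main_input_name`; "input_ids" is the fallback used above.
main_input_name = "input_ids"
if main_input_name in inputs:
    # gather() concatenates the tensor from all processes, so numel() counts
    # every input token (padding included) in the global batch.
    num_input_tokens_seen += accelerator.gather(inputs[main_input_name]).numel()
```

Note that `numel()` counts padding tokens as well, and the cross-process `gather` on every step is why the new argument's docstring warns of a slowdown in distributed training.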
@@ -59,6 +59,8 @@ class TrainerState:
            Run an evaluation every X steps.
        save_steps (`int`, *optional*, defaults to 500):
            Save checkpoint every X update steps.
        num_input_tokens_seen (`int`, *optional*, defaults to 0):
            The number of tokens seen during training (number of input tokens, not the number of prediction tokens).
        total_flos (`float`, *optional*, defaults to 0):
            The total number of floating operations done by the model since the beginning of training (stored as floats
            to avoid overflow).
@@ -87,6 +89,7 @@ class TrainerState:
    eval_steps: int = 500
    save_steps: int = 500
    num_train_epochs: int = 0
    num_input_tokens_seen: int = 0
    total_flos: float = 0
    log_history: List[Dict[str, float]] = None
    best_metric: Optional[float] = None
...
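Because the counter lives on `TrainerState`, it is visible anywhere the state is passed around, for example in a callback. A hedged sketch, assuming the flag below is enabled (the callback class and its name are illustrative, not part of this PR):

```python
from transformers import TrainerCallback

class TokenBudgetCallback(TrainerCallback):
    """Illustrative callback: stop training once a token budget is exhausted."""

    def __init__(self, max_tokens: int):
        self.max_tokens = max_tokens

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `state.num_input_tokens_seen` stays 0 unless
        # `include_num_input_tokens_seen=True` is set in TrainingArguments.
        if state.num_input_tokens_seen >= self.max_tokens:
            control.should_training_stop = True
```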
@@ -637,6 +637,12 @@ class TrainingArguments:
            This will iterate over the entire training dataloader once beforehand,
            and will slow down the entire process.
        include_num_input_tokens_seen (`bool`, *optional*):
            Whether or not to track the number of input tokens seen throughout training.
            May be slower in distributed training as gather operations must be called.
        neftune_noise_alpha (`Optional[float]`):
            If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance
            for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the
@@ -1258,6 +1264,13 @@ class TrainingArguments:
        metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
    )
    include_num_input_tokens_seen: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training)"
        },
    )
    neftune_noise_alpha: float = field(
        default=None,
        metadata={
...
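Putting the three changes together, enabling the feature is a single argument, after which every logged metrics dict carries `num_input_tokens_seen`. A usage sketch (the output path, model, and dataset are assumed placeholders):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",                    # placeholder path
    include_num_input_tokens_seen=True,  # opt in to token tracking
    logging_steps=10,
)

# trainer = Trainer(model=model, args=args, train_dataset=train_ds)
# trainer.train()
# Each entry in trainer.state.log_history then includes, e.g.:
# {"loss": ..., "num_input_tokens_seen": 40960, "step": 10}
```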