"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "7d683f7baeed1f00e03555d58d32a6213263ef5f"
Unverified Commit 2fc33ebe authored by Zach Mueller, committed by GitHub

Track the number of tokens seen to metrics (#27274)



* Add tokens seen

* Address comments, add to TrainingArgs

* Update log

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Use self.args

* Fix docstring
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 303c1d69
@@ -1838,6 +1838,17 @@ class Trainer:
            step = -1
            for step, inputs in enumerate(epoch_iterator):
                total_batched_samples += 1

                if self.args.include_num_input_tokens_seen:
                    main_input_name = getattr(self.model, "main_input_name", "input_ids")
                    if main_input_name not in inputs:
                        logger.warning(
                            "Tried to track the number of tokens seen, however the current model is "
                            "not configured properly to know what item is the input. To fix this, add "
                            "a `main_input_name` attribute to the model class you are using."
                        )
                    else:
                        self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()
                if rng_to_sync:
                    self._load_rng_state(resume_from_checkpoint)
                    rng_to_sync = False
@@ -2640,6 +2651,8 @@ class Trainer:
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)
        if self.args.include_num_input_tokens_seen:
            logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen
        output = {**logs, **{"step": self.state.global_step}}
        self.state.log_history.append(output)
...
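Stripped of the surrounding loop, the new counting logic reduces to a few lines: count every element of the batch's main input tensor, gathered across processes. A minimal sketch of the same pattern, assuming a dict batch with an `input_ids` tensor (the documented fallback for `main_input_name`); this illustrates the idea rather than reproducing the Trainer internals verbatim:

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()
num_input_tokens_seen = 0

# Hypothetical batch, standing in for one `inputs` dict from `epoch_iterator`.
inputs = {"input_ids": torch.randint(0, 50_000, (8, 512))}

# Models expose `main_input_name`; "input_ids" is the fallback used above.
main_input_name = "input_ids"
if main_input_name in inputs:
    # gather() concatenates the tensor from all processes, so numel() counts
    # every input token (padding included) in the global batch.
    num_input_tokens_seen += accelerator.gather(inputs[main_input_name]).numel()
```

Note that `numel()` counts padding tokens as well, and the cross-process `gather` on every step is why the new argument's docstring warns of a slowdown in distributed training.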
@@ -59,6 +59,8 @@ class TrainerState:
            Run an evaluation every X steps.
        save_steps (`int`, *optional*, defaults to 500):
            Save checkpoint every X update steps.
        num_input_tokens_seen (`int`, *optional*, defaults to 0):
            The number of tokens seen during training (number of input tokens, not the number of prediction tokens).
        total_flos (`float`, *optional*, defaults to 0):
            The total number of floating operations done by the model since the beginning of training (stored as floats
            to avoid overflow).
@@ -87,6 +89,7 @@ class TrainerState:
    eval_steps: int = 500
    save_steps: int = 500
    num_train_epochs: int = 0
    num_input_tokens_seen: int = 0
    total_flos: float = 0
    log_history: List[Dict[str, float]] = None
    best_metric: Optional[float] = None
...
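Because the counter lives on `TrainerState`, it is visible anywhere the state is passed around, for example in a callback. A hedged sketch, assuming the flag below is enabled (the callback class and its name are illustrative, not part of this PR):

```python
from transformers import TrainerCallback

class TokenBudgetCallback(TrainerCallback):
    """Illustrative callback: stop training once a token budget is exhausted."""

    def __init__(self, max_tokens: int):
        self.max_tokens = max_tokens

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `state.num_input_tokens_seen` stays 0 unless
        # `include_num_input_tokens_seen=True` is set in TrainingArguments.
        if state.num_input_tokens_seen >= self.max_tokens:
            control.should_training_stop = True
```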
@@ -637,6 +637,12 @@ class TrainingArguments:
            This will iterate over the entire training dataloader once beforehand,
            and will slow down the entire process.
        include_num_input_tokens_seen (`bool`, *optional*):
            Whether or not to track the number of input tokens seen throughout training.
            May be slower in distributed training as gather operations must be called.
        neftune_noise_alpha (`Optional[float]`):
            If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance
            for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the
@@ -1258,6 +1264,13 @@ class TrainingArguments:
        metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
    )
    include_num_input_tokens_seen: Optional[bool] = field(
        default=False,
        metadata={
            "help": "If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training)"
        },
    )
    neftune_noise_alpha: float = field(
        default=None,
        metadata={
...
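Putting the three changes together, enabling the feature is a single argument, after which every logged metrics dict carries `num_input_tokens_seen`. A usage sketch (the output path, model, and dataset are assumed placeholders):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",                    # placeholder path
    include_num_input_tokens_seen=True,  # opt in to token tracking
    logging_steps=10,
)

# trainer = Trainer(model=model, args=args, train_dataset=train_ds)
# trainer.train()
# Each entry in trainer.state.log_history then includes, e.g.:
# {"loss": ..., "num_input_tokens_seen": 40960, "step": 10}
```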