Unverified Commit eab5f596 authored by Suraj Patil, committed by GitHub

[s2s] add create student script (#7290)


Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
parent e50a931c
@@ -369,7 +369,7 @@ runtime: 6H on NVIDIA RTX 24GB GPU
 If you are using `wandb` and comparing the two distillation methods, using this entry point will make your logs consistent,
 because you will have the same hyperparameters logged in every run.
-#### With a teacher
+#### With a teacher (Intermediate Supervision)
 *Note* only BART variants are supported
 In this method, we try to enforce that the student and teacher produce similar encoder_outputs, logits, and hidden_states using `BartSummarizationDistiller`.
@@ -378,7 +378,7 @@ This is how `sshleifer/distilbart-xsum*` checkpoints were produced.
 The command that produced `sshleifer/distilbart-xsum-12-6` is:
 ```bash
-./train_distilbart_xsum.sh
+./train_distilbart_xsum.sh --logger_name wandb --gpus 1
 ```
 runtime: 13H on V-100 16GB GPU.
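To make the "similar logits and hidden_states" objective described above concrete, here is a minimal sketch of this style of distillation loss. It is illustrative only and not the actual `BartSummarizationDistiller` code: the function name, tensor shapes, and default weights are assumptions; only the overall recipe (hard-label cross-entropy, temperature-scaled KL on the logits, and an `alpha_hid`-weighted hidden-state MSE) mirrors the description.

```python
import torch.nn.functional as F


def distillation_loss_sketch(
    student_logits, teacher_logits, student_hidden, teacher_hidden, labels,
    alpha_ce=0.8, alpha_hid=3.0, temperature=2.0,
):
    """Illustrative seq2seq distillation objective (not the exact BartSummarizationDistiller loss)."""
    vocab_size = student_logits.size(-1)
    # Hard-label cross-entropy against the reference target tokens (-100 marks padding).
    ce = F.cross_entropy(student_logits.view(-1, vocab_size), labels.view(-1), ignore_index=-100)
    # Soft-label KL divergence between temperature-scaled teacher and student token distributions.
    kl = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1).view(-1, vocab_size),
        F.softmax(teacher_logits / temperature, dim=-1).view(-1, vocab_size),
        reduction="batchmean",
    ) * (temperature ** 2)
    # MSE between student hidden states and the matched teacher layers (cf. --alpha_hid in the training script below).
    hid = F.mse_loss(student_hidden, teacher_hidden)
    return ce + alpha_ce * kl + alpha_hid * hid
```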
from typing import List

from torch import nn


def init_student(student, teacher):
    """Initialize the student by loading every weight that matches a teacher weight by name."""
    teacher_state_dict = teacher.state_dict()
    info = student.load_state_dict(teacher_state_dict, strict=False)
    assert info.missing_keys == [], info.missing_keys
    return student, info


def copy_decoder_layers(teacher, student, l2copy=[0, 2, 4, 7, 9, 11]):
    """Copy the chosen teacher decoder layers into the (smaller) student decoder."""
    copy_layers(teacher.model.decoder.layers, student.model.decoder.layers, l2copy)


def copy_layers(teacher_layers: nn.ModuleList, student_layers: nn.ModuleList, layers_to_copy: List) -> None:
    layers_to_copy = nn.ModuleList([l for i, l in enumerate(teacher_layers) if i in layers_to_copy])
    assert len(student_layers) == len(layers_to_copy), f"{len(student_layers)} != {len(layers_to_copy)}"
    student_layers.load_state_dict(layers_to_copy.state_dict())
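For context, a minimal sketch of how the helpers above fit together. The teacher checkpoint and the 6-layer student below are illustrative choices, not taken from this diff:

```python
from transformers import AutoConfig, AutoModelForSeq2SeqLM

# Hypothetical example: shrink a 12-layer BART decoder down to 6 layers.
teacher = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-xsum").eval()
student_config = AutoConfig.from_pretrained("facebook/bart-large-xsum", decoder_layers=6)
student = AutoModelForSeq2SeqLM.from_config(student_config)

student, info = init_student(student, teacher)  # copy every weight that lines up by name
copy_decoder_layers(teacher, student)           # overwrite the 6 decoder layers with teacher layers [0, 2, 4, 7, 9, 11]
```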
import warnings
from pathlib import Path
from typing import List, Tuple, Union

import fire
from torch import nn

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel
from transformers.utils import logging


logger = logging.get_logger(__name__)


def copy_layers(src_layers: nn.ModuleList, dest_layers: nn.ModuleList, layers_to_copy: List[int]) -> None:
layers_to_copy = nn.ModuleList([l for i, l in enumerate(src_layers) if i in layers_to_copy])
assert len(dest_layers) == len(layers_to_copy), f"{len(dest_layers)} != {len(layers_to_copy)}"
    dest_layers.load_state_dict(layers_to_copy.state_dict())


LAYERS_TO_COPY = {
# maps num layers in teacher -> num_layers in student -> which teacher layers to copy.
# 12: bart, 16: pegasus, 6: marian/Helsinki-NLP
12: {
1: [0], # This says that if the teacher has 12 layers and the student has 1, copy layer 0 of the teacher
2: [0, 6],
3: [0, 6, 11],
4: [0, 4, 8, 11],
6: [0, 2, 4, 7, 9, 11],
9: [0, 1, 2, 4, 5, 7, 9, 10, 11],
12: list(range(12)),
},
16: { # maps num layers in student -> which teacher layers to copy
1: [0],
2: [0, 8],
3: [0, 8, 15],
4: [0, 5, 10, 15],
6: [0, 3, 6, 9, 12, 15],
8: [0, 2, 4, 6, 8, 10, 12, 15],
9: [0, 1, 3, 5, 7, 9, 11, 13, 15],
12: [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 15],
16: list(range(16)),
},
6: {1: [0], 2: [0, 5], 3: [0, 2, 5], 4: [0, 1, 3, 5], 6: list(range(6))},
}


LAYERS_TO_SUPERVISE = {
    # maps num layers in teacher -> num layers in student -> which teacher layers to supervise.
6: {1: [5], 2: [3, 5], 3: [1, 4, 5], 4: [1, 2, 4, 5]},
12: {1: [11], 2: [5, 11], 3: [3, 7, 11], 6: [1, 3, 5, 8, 10, 11]},
16: {1: [15], 4: [4, 9, 12, 15], 8: [1, 3, 5, 7, 9, 11, 13, 15]},
}


def pick_layers_to_copy(n_student, n_teacher):
try:
val = LAYERS_TO_COPY[n_teacher][n_student]
return val
except KeyError:
if n_student != n_teacher:
warnings.warn(
f"no hardcoded layers to copy for teacher {n_teacher} -> student {n_student}, defaulting to first {n_student}"
)
        return list(range(n_student))


def get_layers_to_supervise(n_student, n_teacher) -> List[int]:
    """Used for the --supervise_forward kwarg."""
if n_student > n_teacher:
raise ValueError(f"Cannot perform intermediate supervision for student {n_student} > teacher {n_teacher}")
elif n_teacher == n_student:
return list(range(n_teacher))
elif n_student == 1:
return [n_teacher - 1]
else:
return LAYERS_TO_SUPERVISE[n_teacher][n_student]
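

# Illustrative values (examples added for readability, not part of the original script; derived from the
# LAYERS_TO_COPY / LAYERS_TO_SUPERVISE tables above):
#   pick_layers_to_copy(n_student=3, n_teacher=12)      -> [0, 6, 11]
#   pick_layers_to_copy(n_student=5, n_teacher=12)      -> [0, 1, 2, 3, 4]  (no hardcoded entry, so it warns and
#                                                           defaults to the first 5 teacher layers)
#   get_layers_to_supervise(n_student=3, n_teacher=12)  -> [3, 7, 11]
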
def create_student_by_copying_alternating_layers(
teacher: Union[str, PreTrainedModel],
save_path: Union[str, Path] = "student",
e: Union[int, None] = None,
d: Union[int, None] = None,
copy_first_teacher_layers=False,
**extra_config_kwargs
) -> Tuple[PreTrainedModel, List[int], List[int]]:
"""Make a student by copying alternating layers from a teacher, save it to save_path.
Args:
teacher: str or PreTrainedModel if str, this will call AutoModelForSeq2SeqLM.from_pretrained(teacher) before
copying layers
save_path: where to save the student, defaults to student directory.
e: how many Encoder layers should the student have, default is fully copy of teacher
d: how many Decoder layers should the student have, default is fully copy of teacher
copy_first_teacher_layers: [bool] dont copy alternating layers, just the first e/d.
**extra_config_kwargs: extra kwargs to pass to the student, by default the teacher config is used.
Returns:
student: new, smaller model. (Also saves it to save_path)
e_layers_to_copy: list of which teacher encoder layers were used
d_layers_to_copy: list of which teacher decoder layers were used
"""
_msg = "encoder_layers and decoder_layers cannot be both None-- you would just have an identical teacher."
assert (e is not None) or (d is not None), _msg
if isinstance(teacher, str):
AutoTokenizer.from_pretrained(teacher).save_pretrained(save_path) # purely for convenience
teacher = AutoModelForSeq2SeqLM.from_pretrained(teacher).eval()
else:
        assert isinstance(teacher, PreTrainedModel), f"teacher must be a model or string, got type {type(teacher)}"
init_kwargs = teacher.config.to_diff_dict()
try:
teacher_e, teacher_d = teacher.config.encoder_layers, teacher.config.decoder_layers
if e is None:
e = teacher_e
if d is None:
d = teacher_d
init_kwargs.update({"encoder_layers": e, "decoder_layers": d})
except AttributeError: # T5
teacher_e, teacher_d = teacher.config.num_layers, teacher.config.num_hidden_layers
assert e == d, "T5 Students must be symmetric"
init_kwargs["num_layers"] = e
# Kwargs to instantiate student = teacher kwargs with updated layer numbers + **extra_config_kwargs
init_kwargs.update(extra_config_kwargs)
# Copy weights
student_cfg = teacher.config_class(**init_kwargs)
student = AutoModelForSeq2SeqLM.from_config(student_cfg)
    # Start by copying the full teacher state dict; this will copy the first N teacher layers to the student.
    info = student.load_state_dict(teacher.state_dict(), strict=False)
    assert info.missing_keys == [], info.missing_keys  # every student key should have a teacher key.
if copy_first_teacher_layers: # Our copying is done. We just log and save
e_layers_to_copy, d_layers_to_copy = list(range(e)), list(range(d))
logger.info(
f"Copied encoder layers {e_layers_to_copy} and decoder layers {d_layers_to_copy}. Saving them to {save_path}"
)
student.save_pretrained(save_path)
return student, e_layers_to_copy, d_layers_to_copy
# Decide which layers of the teacher to copy. Not exactly alternating -- we try to keep first and last layer.
e_layers_to_copy: List[int] = pick_layers_to_copy(e, teacher_e)
d_layers_to_copy: List[int] = pick_layers_to_copy(d, teacher_d)
try:
copy_layers(teacher.model.encoder.layers, student.model.encoder.layers, e_layers_to_copy)
copy_layers(teacher.model.decoder.layers, student.model.decoder.layers, d_layers_to_copy)
except AttributeError: # For t5, student.model.encoder.layers is called student.encoder.block
copy_layers(teacher.encoder.block, student.encoder.block, e_layers_to_copy)
copy_layers(teacher.decoder.block, student.decoder.block, d_layers_to_copy)
logger.info(
f"Copied encoder layers {e_layers_to_copy} and decoder layers {d_layers_to_copy}. Saving them to {save_path}"
)
    # Save information about the copying for easier reproducibility
    student.config.init_metadata = dict(
        teacher_type=teacher.config.model_type,
        copied_encoder_layers=e_layers_to_copy,
        copied_decoder_layers=d_layers_to_copy,
    )
    student.save_pretrained(save_path)
    return student, e_layers_to_copy, d_layers_to_copy


if __name__ == "__main__":
fire.Fire(create_student_by_copying_alternating_layers)
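For reference, a minimal usage sketch of the function above. The teacher checkpoint, output directory, and layer counts are illustrative choices, not taken from this diff; the module name `make_student.py` follows the import used in the tests further down.

```python
from make_student import create_student_by_copying_alternating_layers

# Hypothetical example: build a 12-encoder-layer / 3-decoder-layer student from a 12/12 BART teacher.
student, e_copied, d_copied = create_student_by_copying_alternating_layers(
    "facebook/bart-large-xsum", save_path="student_xsum_12_3", e=12, d=3
)
print(e_copied)  # list(range(12)): every encoder layer is copied
print(d_copied)  # [0, 6, 11], per LAYERS_TO_COPY[12][3]
```

Because the module ends with `fire.Fire(create_student_by_copying_alternating_layers)`, the same call should also work from the shell as `python make_student.py facebook/bart-large-xsum --save_path student_xsum_12_3 --e 12 --d 3`, with flags mirroring the function's parameters.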
#!/usr/bin/env python
import fire

from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer


def save_randomly_initialized_version(config_name: str, save_dir: str, **config_kwargs):
"""Save a randomly initialized version of a model using a pretrained config.
Args:
config_name: which config to use
save_dir: where to save the resulting model and tokenizer
config_kwargs: Passed to AutoConfig
Usage::
save_randomly_initialized_version("facebook/bart-large-cnn", "distilbart_random_cnn_6_3", encoder_layers=6, decoder_layers=3, num_beams=3)
"""
cfg = AutoConfig.from_pretrained(config_name, **config_kwargs)
model = AutoModelForSeq2SeqLM.from_config(cfg)
model.save_pretrained(save_dir)
AutoTokenizer.from_pretrained(config_name).save_pretrained(save_dir)
    return model


if __name__ == "__main__":
    fire.Fire(save_randomly_initialized_version)
import tempfile
import unittest

from make_student import create_student_by_copying_alternating_layers
from transformers import AutoConfig
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch


TINY_BART = "sshleifer/bart-tiny-random"
TINY_T5 = "patrickvonplaten/t5-tiny-random"


@require_torch
class MakeStudentTester(unittest.TestCase):
    @cached_property
    def teacher_config(self):
        return AutoConfig.from_pretrained(TINY_BART)

    def test_valid_t5(self):
        student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=1)
        self.assertEqual(student.config.num_hidden_layers, 1)

    def test_invalid_t5(self):
        # T5 students must have e == d because the T5 config exposes a single num_layers property.
        with self.assertRaises(AssertionError):
            student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=None)

    def test_same_decoder_small_encoder(self):
        student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=None)
        self.assertEqual(student.config.encoder_layers, 1)
        # d=None keeps the teacher's decoder depth; the tiny teacher has equally many encoder and decoder layers.
        self.assertEqual(student.config.decoder_layers, self.teacher_config.encoder_layers)

    def test_small_enc_small_dec(self):
        student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=1)
        self.assertEqual(student.config.encoder_layers, 1)
        self.assertEqual(student.config.decoder_layers, 1)

    def test_raises_assert(self):
        # e=None and d=None would just reproduce the teacher, so the function refuses.
        with self.assertRaises(AssertionError):
            create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=None, d=None)
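To exercise these tests locally, a minimal runner sketch; the filename of this test module is not shown in the diff, so `test_make_student` below is an assumed name, adjust it to wherever the file is saved:

```python
import unittest

# Load and run the MakeStudentTester suite (assumed module name: test_make_student,
# saved alongside make_student.py so the import at the top of the test file resolves).
suite = unittest.defaultTestLoader.loadTestsFromName("test_make_student")
unittest.TextTestRunner(verbosity=2).run(suite)
```

Equivalently, `python -m pytest test_make_student.py -v` works from the same directory if pytest is installed.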
 #!/usr/bin/env bash
 export PYTHONPATH="../":"${PYTHONPATH}"
-export BS=16
-export GAS=2
 python distillation.py \
+  --teacher facebook/bart-large-xsum --data_dir xsum \
+  --student_decoder_layers 6 --student_encoder_layers 12 \
+  --freeze_encoder --freeze_embeds \
   --learning_rate=3e-4 \
   --do_train \
   --do_predict \
-  --fp16 \
-  --val_check_interval 0.1 --n_val 1000 \
-  --teacher facebook/bart-large-xsum --data_dir $XSUM_DIR \
+  --fp16 --fp16_opt_level=O1 \
+  --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \
   --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \
-  --student_decoder_layers 6 --student_encoder_layers 12 \
-  --freeze_encoder --freeze_embeds \
   --model_name_or_path IGNORED \
-  --alpha_hid=3. --length_penalty=0.5 \
-  --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS --num_train_epochs=6 \
-  --tokenizer_name facebook/bart-large \
+  --alpha_hid=3. \
+  --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \
+  --sortish_sampler \
+  --num_train_epochs=6 \
   --warmup_steps 500 \
   --output_dir distilbart_xsum_12_6 \
   "$@"