Llama accelerate tutorial (#720)

* tutorial and doc fixes Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com> * remove extra code Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com> * fix typos Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com> --------- Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>

Llama accelerate tutorial (#720)
* tutorial and doc fixes Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com> * remove extra code Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com> * fix typos Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com> --------- Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>
c38779be · Sudhakar Singh · GitHub · 965803c9 · c38779be · c38779be
Unverified Commit c38779be authored Mar 20, 2024 by Sudhakar Singh Committed by GitHub Mar 20, 2024
3 changed files
--- a/docs/examples/te_llama/te_llama.py
+++ b/docs/examples/te_llama/te_llama.py
@@ -21,12 +21,12 @@ from transformers.utils import WEIGHTS_INDEX_NAME
 from transformers.utils.hub import get_checkpoint_shard_files

 @contextmanager
-def replace_decoder(te_decodder_cls):
+def replace_decoder(te_decoder_cls):
    """
    Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`.
    """
    original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer
-    transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls
+    transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls
    try:
        yield
    finally:
@@ -56,6 +56,7 @@ class TELlamaDecoderLayer(te.pytorch.TransformerLayer):
            normalization="RMSNorm",
            activation="swiglu",
            attn_input_format="bshd",
+            num_gqa_groups=config.num_key_value_heads,
        )
        te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads)
        self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()
@@ -84,7 +85,7 @@ class TELlamaForCausalLM:
    """

    def __new__(cls, config: LlamaConfig):
-        with replace_decoder(te_decodder_cls=TELlamaDecoderLayer):
+        with replace_decoder(te_decoder_cls=TELlamaDecoderLayer):
            llama_for_causal_lm = LlamaForCausalLM(config)
        return llama_for_causal_lm


--- a/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb
+++ b/docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "id": "1f37565e",
+   "id": "2cac9d39",
   "metadata": {},
   "source": [
    "# Accelerating a Hugging Face Llama 2 model with Transformer Engine\n",
@@ -11,14 +11,14 @@
    "\n",
    "<b>Goal</b>\n",
    "\n",
-    "This tutorial showcases how accelerate finetuning a full Llama 2 model from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf) by using `TransformerLayer` from the [Transformer Engine library](https://github.com/NVIDIA/TransformerEngine) in `BF16` and `FP8` precisions.\n",
+    "This tutorial showcases how to accelerate finetuning a full Llama 2 model from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf) by using `TransformerLayer` from the [Transformer Engine library](https://github.com/NVIDIA/TransformerEngine) in `BF16` and `FP8` precisions.\n",
    "\n",
    "</div>\n"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "ab4c0b82",
+   "id": "401f7fb1",
   "metadata": {},
   "source": [
    "## Dependencies for this tutorial\n",
@@ -35,7 +35,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "466ff515",
+   "id": "33bdb5fe",
   "metadata": {},
   "source": [
    "## Table of contents\n",
@@ -53,7 +53,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "8e84bcaa",
+   "id": "7645f176",
   "metadata": {},
   "source": [
    "## From \"Transformer\" to \"Llama\" \n",
@@ -89,7 +89,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "e31303c7",
+   "id": "d0cfa787",
   "metadata": {},
   "source": [
    "## Hugging Face's `LlamaModel`\n",
@@ -166,7 +166,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "686df4ef",
+   "id": "f4f21369",
   "metadata": {},
   "source": [
    "## [Baseline] Running HF `LlamaModel` (Precision: `BF16`)\n",
@@ -190,7 +190,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "107a8146",
+   "id": "24a8d0a5",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
@@ -206,8 +206,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "id": "975f9184",
+   "execution_count": 1,
+   "id": "e36ff380",
   "metadata": {},
   "outputs": [
    {
@@ -215,7 +215,7 @@
     "output_type": "stream",
     "text": [
      "10 finetuning steps complete!\n",
-      "Average time taken per step: 289 milliseconds\n"
+      "Average time taken per step: 315 milliseconds\n"
     ]
    }
   ],
@@ -247,19 +247,19 @@
  },
  {
   "cell_type": "markdown",
-   "id": "c2d5b174",
+   "id": "a64f0f33",
   "metadata": {},
   "source": [
    "Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n",
    "\n",
    "| Models                                                      | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n",
    "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n",
-    "| HF (baseline)                                               | BF16      | 289                         | 1                       |"
+    "| HF (baseline)                                               | BF16      | 315                         | 1                       |"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "a7d436bf",
+   "id": "d9898383",
   "metadata": {},
   "source": [
    "## [Improvement 1] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n",
@@ -322,6 +322,7 @@
    "            normalization=\"RMSNorm\",\n",
    "            activation=\"swiglu\",\n",
    "            attn_input_format=\"bshd\",\n",
+    "            num_gqa_groups=config.num_key_value_heads,\n",
    "        )\n",
    "        te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads)\n",
    "        self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()\n",
@@ -339,10 +340,11 @@
    "8. `fuse_qkv_params`:  if set to True, TransformerLayer module exposes a single fused parameter for query-key-value. This enables optimizations such as QKV fusion without concatentations/splits and also enables the argument fuse_wgrad_accumulation.\n",
    "9. `normalization`: type of normalization applied. Default is `LayerNorm`.\n",
    "10. `activation`: type of activation used in the MLP block. Default is `gelu`.\n",
-    "11. `attn_input_format`: controls whether the dimensions of the intermediate hidden states is 'batch first' ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length, `b` batch size, `h` the number of heads, `d` head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules. \n",
+    "11. `attn_input_format`: controls whether the dimensions of the intermediate hidden states is 'batch first' ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length, `b` batch size, `h` the number of heads, `d` head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules.\n",
+    "12. `num_gqa_groups`: number of GQA groups in the transformer layer. Grouped Query Attention is described in [this paper](https://arxiv.org/pdf/2305.13245.pdf). This only affects the keys and values, not the queries. GQA-1 is equivalent to Multi-Query Attention ([MQA](https://arxiv.org/pdf/1911.02150.pdf)), while GQA-H is equivalent to Multi-Head Attention, i.e. `num_gqa_groups = num_attention_heads`.\n",
    "\n",
    "\n",
-    "Further, note that `RotaryPositionEmbedding` is defined as part of the TE's `TransformerLayer` itself since it expects this rope cache if RoPE is used in the model. \n",
+    "Further, note that `RotaryPositionEmbedding` is defined as part of the `TELlamaDecoderLayer` (wrapper around TE's `TransformerLayer`) itself since it expects this rope cache if RoPE is used in the model. \n",
    "\n",
    "Let's revisit how `LlamaDecoderLayer`s form the core of the decoder layer stack in HF's llama implementation:\n",
    "```\n",
@@ -422,12 +424,12 @@
    "\n",
    "```\n",
    "@contextmanager\n",
-    "def replace_decoder(te_decodder_cls):\n",
+    "def replace_decoder(te_decoder_cls):\n",
    "    \"\"\"\n",
    "    Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`.\n",
    "    \"\"\"\n",
    "    original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer\n",
-    "    transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls\n",
+    "    transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls\n",
    "    try:\n",
    "        yield\n",
    "    finally:\n",
@@ -446,7 +448,7 @@
    "    \"\"\"\n",
    "\n",
    "    def __new__(cls, config: LlamaConfig):\n",
-    "        with replace_decoder(te_decodder_cls=TELlamaDecoderLayer):\n",
+    "        with replace_decoder(te_decoder_cls=TELlamaDecoderLayer):\n",
    "            llama_for_causal_lm = LlamaForCausalLM(config)\n",
    "        return llama_for_causal_lm\n",
    ".\n",
@@ -530,7 +532,7 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "id": "48dc8935",
+   "id": "4974b738",
   "metadata": {},
   "outputs": [
    {
@@ -538,7 +540,7 @@
     "output_type": "stream",
     "text": [
      "10 finetuning steps complete!\n",
-      "Average time taken per step: 242 milliseconds\n"
+      "Average time taken per step: 252 milliseconds\n"
     ]
    }
   ],
@@ -570,20 +572,20 @@
  },
  {
   "cell_type": "markdown",
-   "id": "3c3d228a",
+   "id": "85c78c7f",
   "metadata": {},
   "source": [
-    "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `LlamaDecoderLayer` gives a speedup of **19%** even when using only BF16 precision!\n",
+    "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Hugging Face's `LlamaDecoderLayer` gives a speedup of **25%** even when using only BF16 precision!\n",
    "\n",
    "| Models                                                      | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n",
    "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n",
-    "| HF (baseline)                                               | BF16      | 289                         | 1                       |\n",
-    "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16      | 242                         | 1.19                    |"
+    "| HF (baseline)                                               | BF16      | 315                         | 1                       |\n",
+    "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16      | 252                         | 1.25                    |"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "b92d6792",
+   "id": "e2fb88e9",
   "metadata": {},
   "source": [
    "## [Improvement 2] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n",
@@ -608,8 +610,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "id": "6bba7cc1",
+   "execution_count": 1,
+   "id": "8f2b752e",
   "metadata": {},
   "outputs": [
    {
@@ -617,7 +619,7 @@
     "output_type": "stream",
     "text": [
      "10 finetuning steps complete!\n",
-      "Average time taken per step: 231 milliseconds\n"
+      "Average time taken per step: 226 milliseconds\n"
     ]
    }
   ],
@@ -649,27 +651,27 @@
  },
  {
   "cell_type": "markdown",
-   "id": "602239d7",
+   "id": "67ec126c",
   "metadata": {},
   "source": [
    "| Models                                                      | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n",
    "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n",
-    "| HF (baseline)                                               | BF16      | 289                         | 1                       |\n",
-    "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16      | 242                         | 1.19                    |\n",
-    "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | FP8       | 231                         | 1.25                    |\n",
+    "| HF (baseline)                                               | BF16      | 315                         | 1                       |\n",
+    "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16      | 252                         | 1.25                    |\n",
+    "| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | FP8       | 226                         | 1.39                    |\n",
    "\n",
    "\n",
-    "After turning on FP8 precision, we get even more speedup of **25%**!"
+    "After turning on FP8 precision, we get even more speedup of almost **40%**!"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "372867d5",
+   "id": "41b80b0f",
   "metadata": {},
   "source": [
    "## Conclusion\n",
    "\n",
-    "Using `TransformerLayer` module from Transformer Engine as a substitute for Hugging Face's `LlamaDecoderLayer` provides speedup over Hugging Face's native Llama 2 implementation. This needs careful initializing of model such that the model weights (which are meant for `LlamaDecoderLayer`) are correctly mapped to their counterparts in TE's `TransformerLayer`. Even with `BF16` precision, `TransformerLayer` provides a speedup over the baseline implementation. With `FP8` precision, the speed up is even more pronounced!"
+    "Using `TransformerLayer` module from Transformer Engine as a substitute for Hugging Face's `LlamaDecoderLayer` provides a speedup over Hugging Face's native Llama 2 implementation. This needs careful initialization of the model such that the model weights (which are meant for `LlamaDecoderLayer`) are correctly mapped to their counterparts in TE's `TransformerLayer`. Even with `BF16` precision, `TransformerLayer` provides a speedup over the baseline implementation. With `FP8` precision, the speedup is even more pronounced!"
   ]
  }
 ],

--- a/docs/examples/te_llama/utils.py
+++ b/docs/examples/te_llama/utils.py
@@ -26,8 +26,10 @@ class HyperParameters:
        self.batch_size = 8
        self.max_seq_length = 256
        self.gradient_accumulation_steps = 1
+        self.num_warmup_steps=5
        self.num_training_steps=10
        
+
 hyperparams = HyperParameters()

 def get_dataloaders(accelerator:Accelerator, hyperparams):
@@ -132,11 +134,9 @@ def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer,
    optimizer.zero_grad()
    train_dataloader = enumerate(train_dataloader)

-    time_vals = []
-
-    for _ in range(hyperparams.num_training_steps):
+    # Warmup iters
+    for _ in range(hyperparams.num_warmup_steps):
        step, batch = next(train_dataloader)
-        start_time = time.time()
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
@@ -146,15 +146,28 @@ def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer,
            lr_scheduler.step()
            optimizer.zero_grad()

-        end_time = time.time()
-        total_time = end_time - start_time
-        time_vals.append(total_time)
+    # Get the timers ready
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()

+    start.record()
+    # Training iters
+    for _ in range(hyperparams.num_training_steps):
+        step, batch = next(train_dataloader)
+        with accelerator.accumulate(model):
+            outputs = model(**batch)
+            loss = outputs.loss
+            total_loss += loss.detach().float()
+            accelerator.backward(loss)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+    torch.cuda.synchronize()
+    end.record()
    accelerator.end_training()

-    # ignore the first couple of time vals
-    time_vals = time_vals[2:]
-    print(f"{hyperparams.num_training_steps} finetuning steps complete!\nAverage time taken per step: {(sum(time_vals)/len(time_vals)) * 1000:.0f} milliseconds")
+    print(f"{hyperparams.num_training_steps} finetuning steps complete!\nAverage time taken per step: {(start.elapsed_time(end)/hyperparams.num_training_steps):.0f} milliseconds")

 def restart_jupyter_notebook():
    # Try restarting the Jupyter kernel