Unverified Commit c38779be authored by Sudhakar Singh's avatar Sudhakar Singh Committed by GitHub
Browse files

Llama accelerate tutorial (#720)



* tutorial and doc fixes
Signed-off-by: default avatarSudhakar Singh <sudhakars@nvidia.com>

* remove extra code
Signed-off-by: default avatarSudhakar Singh <sudhakars@nvidia.com>

* fix typos
Signed-off-by: default avatarSudhakar Singh <sudhakars@nvidia.com>

---------
Signed-off-by: default avatarSudhakar Singh <sudhakars@nvidia.com>
parent 965803c9
......@@ -21,12 +21,12 @@ from transformers.utils import WEIGHTS_INDEX_NAME
from transformers.utils.hub import get_checkpoint_shard_files
@contextmanager
def replace_decoder(te_decodder_cls):
def replace_decoder(te_decoder_cls):
"""
Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`.
"""
original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer
transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls
transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls
try:
yield
finally:
......@@ -56,6 +56,7 @@ class TELlamaDecoderLayer(te.pytorch.TransformerLayer):
normalization="RMSNorm",
activation="swiglu",
attn_input_format="bshd",
num_gqa_groups=config.num_key_value_heads,
)
te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads)
self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()
......@@ -84,7 +85,7 @@ class TELlamaForCausalLM:
"""
def __new__(cls, config: LlamaConfig):
with replace_decoder(te_decodder_cls=TELlamaDecoderLayer):
with replace_decoder(te_decoder_cls=TELlamaDecoderLayer):
llama_for_causal_lm = LlamaForCausalLM(config)
return llama_for_causal_lm
......
......@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "1f37565e",
"id": "2cac9d39",
"metadata": {},
"source": [
"# Accelerating a Hugging Face Llama 2 model with Transformer Engine\n",
......@@ -11,14 +11,14 @@
"\n",
"<b>Goal</b>\n",
"\n",
"This tutorial showcases how accelerate finetuning a full Llama 2 model from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf) by using `TransformerLayer` from the [Transformer Engine library](https://github.com/NVIDIA/TransformerEngine) in `BF16` and `FP8` precisions.\n",
"This tutorial showcases how to accelerate finetuning a full Llama 2 model from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b-hf) by using `TransformerLayer` from the [Transformer Engine library](https://github.com/NVIDIA/TransformerEngine) in `BF16` and `FP8` precisions.\n",
"\n",
"</div>\n"
]
},
{
"cell_type": "markdown",
"id": "ab4c0b82",
"id": "401f7fb1",
"metadata": {},
"source": [
"## Dependencies for this tutorial\n",
......@@ -35,7 +35,7 @@
},
{
"cell_type": "markdown",
"id": "466ff515",
"id": "33bdb5fe",
"metadata": {},
"source": [
"## Table of contents\n",
......@@ -53,7 +53,7 @@
},
{
"cell_type": "markdown",
"id": "8e84bcaa",
"id": "7645f176",
"metadata": {},
"source": [
"## From \"Transformer\" to \"Llama\" \n",
......@@ -89,7 +89,7 @@
},
{
"cell_type": "markdown",
"id": "e31303c7",
"id": "d0cfa787",
"metadata": {},
"source": [
"## Hugging Face's `LlamaModel`\n",
......@@ -166,7 +166,7 @@
},
{
"cell_type": "markdown",
"id": "686df4ef",
"id": "f4f21369",
"metadata": {},
"source": [
"## [Baseline] Running HF `LlamaModel` (Precision: `BF16`)\n",
......@@ -190,7 +190,7 @@
},
{
"cell_type": "markdown",
"id": "107a8146",
"id": "24a8d0a5",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
......@@ -206,8 +206,8 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "975f9184",
"execution_count": 1,
"id": "e36ff380",
"metadata": {},
"outputs": [
{
......@@ -215,7 +215,7 @@
"output_type": "stream",
"text": [
"10 finetuning steps complete!\n",
"Average time taken per step: 289 milliseconds\n"
"Average time taken per step: 315 milliseconds\n"
]
}
],
......@@ -247,19 +247,19 @@
},
{
"cell_type": "markdown",
"id": "c2d5b174",
"id": "a64f0f33",
"metadata": {},
"source": [
"Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n",
"\n",
"| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n",
"|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n",
"| HF (baseline) | BF16 | 289 | 1 |"
"| HF (baseline) | BF16 | 315 | 1 |"
]
},
{
"cell_type": "markdown",
"id": "a7d436bf",
"id": "d9898383",
"metadata": {},
"source": [
"## [Improvement 1] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n",
......@@ -322,6 +322,7 @@
" normalization=\"RMSNorm\",\n",
" activation=\"swiglu\",\n",
" attn_input_format=\"bshd\",\n",
" num_gqa_groups=config.num_key_value_heads,\n",
" )\n",
" te_rope = RotaryPositionEmbedding(config.hidden_size//config.num_attention_heads)\n",
" self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()\n",
......@@ -339,10 +340,11 @@
"8. `fuse_qkv_params`: if set to True, TransformerLayer module exposes a single fused parameter for query-key-value. This enables optimizations such as QKV fusion without concatentations/splits and also enables the argument fuse_wgrad_accumulation.\n",
"9. `normalization`: type of normalization applied. Default is `LayerNorm`.\n",
"10. `activation`: type of activation used in the MLP block. Default is `gelu`.\n",
"11. `attn_input_format`: controls whether the dimensions of the intermediate hidden states is 'batch first' ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length, `b` batch size, `h` the number of heads, `d` head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules. \n",
"11. `attn_input_format`: controls whether the dimensions of the intermediate hidden states is 'batch first' ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length, `b` batch size, `h` the number of heads, `d` head size. Note that these formats are very closely related to the `qkv_format` in the `MultiHeadAttention` and `DotProductAttention` modules.\n",
"12. `num_gqa_groups`: number of GQA groups in the transformer layer. Grouped Query Attention is described in [this paper](https://arxiv.org/pdf/2305.13245.pdf). This only affects the keys and values, not the querys. GQA-1 is equivalent to Multi-Query Attention ([MQA](https://arxiv.org/pdf/1911.02150.pdf)), while GQA-H is equivalent to MultiHead Attention, i.e. `num_gqa_groups = num_attention_heads`.\n",
"\n",
"\n",
"Further, note that `RotaryPositionEmbedding` is defined as part of the TE's `TransformerLayer` itself since it expects this rope cache if RoPE is used in the model. \n",
"Further, note that `RotaryPositionEmbedding` is defined as part of the `TELlamaDecoderLayer` (wrapper around TE's `TransformerLayer`) itself since it expects this rope cache if RoPE is used in the model. \n",
"\n",
"Let's revisit how `LlamaDecoderLayer`s form the core of the decoder layer stack in HF's llama implementation:\n",
"```\n",
......@@ -422,12 +424,12 @@
"\n",
"```\n",
"@contextmanager\n",
"def replace_decoder(te_decodder_cls):\n",
"def replace_decoder(te_decoder_cls):\n",
" \"\"\"\n",
" Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`.\n",
" \"\"\"\n",
" original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer\n",
" transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decodder_cls\n",
" transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls\n",
" try:\n",
" yield\n",
" finally:\n",
......@@ -446,7 +448,7 @@
" \"\"\"\n",
"\n",
" def __new__(cls, config: LlamaConfig):\n",
" with replace_decoder(te_decodder_cls=TELlamaDecoderLayer):\n",
" with replace_decoder(te_decoder_cls=TELlamaDecoderLayer):\n",
" llama_for_causal_lm = LlamaForCausalLM(config)\n",
" return llama_for_causal_lm\n",
".\n",
......@@ -530,7 +532,7 @@
{
"cell_type": "code",
"execution_count": 1,
"id": "48dc8935",
"id": "4974b738",
"metadata": {},
"outputs": [
{
......@@ -538,7 +540,7 @@
"output_type": "stream",
"text": [
"10 finetuning steps complete!\n",
"Average time taken per step: 242 milliseconds\n"
"Average time taken per step: 252 milliseconds\n"
]
}
],
......@@ -570,20 +572,20 @@
},
{
"cell_type": "markdown",
"id": "3c3d228a",
"id": "85c78c7f",
"metadata": {},
"source": [
"Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `LlamaDecoderLayer` gives a speedup of **19%** even when using only BF16 precision!\n",
"Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `LlamaDecoderLayer` gives a speedup of **25%** even when using only BF16 precision!\n",
"\n",
"| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n",
"|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n",
"| HF (baseline) | BF16 | 289 | 1 |\n",
"| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 242 | 1.19 |"
"| HF (baseline) | BF16 | 315 | 1 |\n",
"| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 252 | 1.25 |"
]
},
{
"cell_type": "markdown",
"id": "b92d6792",
"id": "e2fb88e9",
"metadata": {},
"source": [
"## [Improvement 2] Replace HF's `LlamaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n",
......@@ -608,8 +610,8 @@
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6bba7cc1",
"execution_count": 1,
"id": "8f2b752e",
"metadata": {},
"outputs": [
{
......@@ -617,7 +619,7 @@
"output_type": "stream",
"text": [
"10 finetuning steps complete!\n",
"Average time taken per step: 231 milliseconds\n"
"Average time taken per step: 226 milliseconds\n"
]
}
],
......@@ -649,27 +651,27 @@
},
{
"cell_type": "markdown",
"id": "602239d7",
"id": "67ec126c",
"metadata": {},
"source": [
"| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n",
"|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n",
"| HF (baseline) | BF16 | 289 | 1 |\n",
"| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 242 | 1.19 |\n",
"| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 231 | 1.25 |\n",
"| HF (baseline) | BF16 | 315 | 1 |\n",
"| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 252 | 1.25 |\n",
"| TE (replace `LlamaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 226 | 1.39 |\n",
"\n",
"\n",
"After turning on FP8 precision, we get even more speedup of **25%**!"
"After turning on FP8 precision, we get even more speedup of almost **40%**!"
]
},
{
"cell_type": "markdown",
"id": "372867d5",
"id": "41b80b0f",
"metadata": {},
"source": [
"## Conclusion\n",
"\n",
"Using `TransformerLayer` module from Transformer Engine as a substitute for Hugging Face's `LlamaDecoderLayer` provides speedup over Hugging Face's native Llama 2 implementation. This needs careful initializing of model such that the model weights (which are meant for `LlamaDecoderLayer`) are correctly mapped to their counterparts in TE's `TransformerLayer`. Even with `BF16` precision, `TransformerLayer` provides a speedup over the baseline implementation. With `FP8` precision, the speed up is even more pronounced!"
"Using `TransformerLayer` module from Transformer Engine as a substitute for Hugging Face's `LlamaDecoderLayer` provides a speedup over Hugging Face's native Llama 2 implementation. This needs careful initialization of the model such that the model weights (which are meant for `LlamaDecoderLayer`) are correctly mapped to their counterparts in TE's `TransformerLayer`. Even with `BF16` precision, `TransformerLayer` provides a speedup over the baseline implementation. With `FP8` precision, the speed up is even more pronounced!"
]
}
],
......
......@@ -26,8 +26,10 @@ class HyperParameters:
self.batch_size = 8
self.max_seq_length = 256
self.gradient_accumulation_steps = 1
self.num_warmup_steps=5
self.num_training_steps=10
hyperparams = HyperParameters()
def get_dataloaders(accelerator:Accelerator, hyperparams):
......@@ -132,11 +134,9 @@ def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer,
optimizer.zero_grad()
train_dataloader = enumerate(train_dataloader)
time_vals = []
for _ in range(hyperparams.num_training_steps):
# Warmup iters
for _ in range(hyperparams.num_warmup_steps):
step, batch = next(train_dataloader)
start_time = time.time()
with accelerator.accumulate(model):
outputs = model(**batch)
loss = outputs.loss
......@@ -146,15 +146,28 @@ def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer,
lr_scheduler.step()
optimizer.zero_grad()
end_time = time.time()
total_time = end_time - start_time
time_vals.append(total_time)
# Get the timers ready
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start.record()
# Training iters
for _ in range(hyperparams.num_training_steps):
step, batch = next(train_dataloader)
with accelerator.accumulate(model):
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
torch.cuda.synchronize()
end.record()
accelerator.end_training()
# ignore the first couple of time vals
time_vals = time_vals[2:]
print(f"{hyperparams.num_training_steps} finetuning steps complete!\nAverage time taken per step: {(sum(time_vals)/len(time_vals)) * 1000:.0f} milliseconds")
print(f"{hyperparams.num_training_steps} finetuning steps complete!\nAverage time taken per step: {(start.elapsed_time(end)/hyperparams.num_training_steps):.0f} milliseconds")
def restart_jupyter_notebook():
# Try restarting the Jupyter kernel
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment