Commit 523ec9cc authored by wangsen's avatar wangsen
Browse files

all

parents
Pipeline #1668 failed with stages
in 0 seconds
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""
Exports:
- RetroConfig: configuration dataclass for RetroModel.
- RetroModel: The Retro model.
- get_retro_decoder_block_spec: Get spec for Retro decoder transformer block.
"""
from .config import RetroConfig
from .decoder_spec import get_retro_decoder_block_spec
from .model import RetroModel
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Base class for decoder and encoder attention modules."""
from megatron.core.models.retro.config import RetroConfig
from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.module import MegatronModule
class BaseRetroCrossAttention(MegatronModule):
    """Common parent of the Retro encoder and decoder cross attention layers.

    Holds a standard ``CrossAttention`` module and caches the Retro settings
    (number of neighbors, chunk length, retrieved length) from the config so
    that Retro's custom cross attention operators can read them directly.

    Args:
        config (RetroConfig): Retro config.
        submodules (CrossAttentionSubmodules): Cross attention submodules.
        layer_number (int): Layer number within transformer block.
        attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
    """

    def __init__(
        self,
        config: RetroConfig,
        submodules: CrossAttentionSubmodules,
        layer_number: int = 1,
        attn_mask_type: AttnMaskType = AttnMaskType.padding,
    ):
        super().__init__(config=config)

        # Underlying attention operator, shared by all Retro variants.
        self.attn = CrossAttention(
            config=config,
            submodules=submodules,
            layer_number=layer_number,
            attn_mask_type=attn_mask_type,
        )

        # Mirror the Retro-specific settings onto the module.
        for setting in (
            "retro_num_neighbors",
            "retro_chunk_length",
            "retro_retrieved_length",
        ):
            setattr(self, setting, getattr(config, setting))
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Configuration dataclass for a RetroModel."""
import os
import types
from dataclasses import dataclass
from importlib.metadata import version
from pkg_resources import packaging
from megatron.core.transformer import TransformerConfig
@dataclass
class RetroConfig(TransformerConfig):
    """Configuration object for Retro models."""

    # Retro.
    retro_project_dir: str = None
    """Retro project directory, which contains the preprocessed data for pretraining. This
    directory is built during preprocessing (see tools/retro/README.md), and contains
    subdirectories for the chunk database and pretraining neighbors.
    """

    retro_block_size: int = None
    """Number of records to load per data file, as saved during preprocessing. Block processing is
    used for efficient data preprocessing.
    """

    retro_chunk_length: int = None
    """Chunk length used for performing chunked cross-attention (CCA)."""

    retro_encoder_num_layers: int = 2
    """Number of layers to use for the retrieval encoder."""

    retro_encoder_hidden_dropout: float = 0.1
    """Hidden dropout for retrieval encoder."""

    retro_encoder_attention_dropout: float = 0.1
    """Attention dropout for retrieval encoder."""

    retro_neighbor_dirs: dict = None
    """Directory names of saved neighbor id files for train, valid, and test datasets."""

    retro_num_neighbors: int = 2
    """Number of neighbors to retrieve during pretraining."""

    retro_num_retrieved_chunks: int = 2
    """Number of chunks to retrieve from the retrieval database."""

    retro_retrieved_length: int = None
    """Cached value of retro_num_retrieved_chunks * retro_chunk_length (i.e., the total number of
    retrieved tokens; neighbor + continuation).
    """

    retro_split_preprocessing: str = None
    """Data split used during data preprocessing."""

    retro_verify_neighbor_count: bool = True
    """Verify that len(GPT dataset) == len(saved neighbors)."""

    def __post_init__(self) -> None:
        """Validate the Retro config and pre-compute derived values.

        Raises:
            Exception: If Transformer Engine >= 1.3 is installed and either
                NVTE_FLASH_ATTN or NVTE_FUSED_ATTN is not set to '0'.
            AssertionError: If retro_split_preprocessing is not defined.
        """
        # Function-scope import so this block needs no new file-level import.
        from importlib.metadata import PackageNotFoundError

        super().__post_init__()

        # Validate Transformer Engine version. Transformer Engine is optional
        # (local module specs exist), so a missing package simply means there
        # is nothing to validate rather than being a configuration error.
        try:
            te_version = packaging.version.Version(version("transformer-engine"))
        except PackageNotFoundError:
            te_version = None

        if te_version is not None and te_version >= packaging.version.Version("1.3"):
            # Explicit checks instead of `assert`, which is stripped under -O.
            flash_attn = os.getenv("NVTE_FLASH_ATTN")
            fused_attn = os.getenv("NVTE_FUSED_ATTN")
            if flash_attn != "0" or fused_attn != "0":
                raise Exception(
                    "When using Transformer Engine >= 1.3, environment vars NVTE_FLASH_ATTN and NVTE_FUSED_ATTN must both be defined and set to '0'. Currently, NVTE_FLASH_ATTN == %s, NVTE_FUSED_ATTN == %s."
                    % (
                        "[unset]" if flash_attn is None else flash_attn,
                        "[unset]" if fused_attn is None else fused_attn,
                    )
                )

        # Preprocessing split should be defined.
        assert (
            self.retro_split_preprocessing is not None
        ), "retro_split_preprocessing must be defined."

        # Pre-compute retrieved length.
        self.retro_retrieved_length = self.retro_num_retrieved_chunks * self.retro_chunk_length
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Retro's cross attention modules for the decoder block."""
from functools import partial
from typing import Callable
import numpy as np
import torch
from torch import Tensor
from megatron.core import InferenceParams
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.retro.base_attention import BaseRetroCrossAttention
from megatron.core.models.retro.config import RetroConfig
from megatron.core.models.retro.utils import get_all_true_mask
from megatron.core.transformer import ModuleSpec
from megatron.core.transformer.attention import CrossAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_block import TransformerBlock
class RetroDecoderCrossAttention(BaseRetroCrossAttention):
    """Chunked cross attention (CCA) operator for Retro decoder layers.

    Paper: https://arxiv.org/abs/2112.04426.

    Chunks retrieved from the chunk database (the "neighbors") are attended
    to by the decoder's hidden states, one chunk at a time.

    ** Note about 'encoder_block_spec' **

    Retro is an encoder-decoder model whose encoder encodes the retrieved
    neighbor chunks. Unlike T5, encoder and decoder are computationally
    intertwined: the encoder's input is the output of the self-attention of
    the first decoder layer. For that reason the encoder block is built
    inside the first Retro decoder layer. Only that first layer owns an
    encoder; the remaining Retro decoder layers consume the encoder output
    produced by the first one.

    Args:
        config (RetroConfig): Retro config.
        submodules (CrossAttentionSubmodules): Cross attention submodules.
        layer_number (int): Layer number within transformer block.
        attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
        encoder_block_spec (ModuleSpec): Transformer block spec used by the first Retro decoder layer to construct the neighbor encoder.
    """

    def __init__(
        self,
        config: RetroConfig,
        submodules: CrossAttentionSubmodules,
        layer_number: int = 1,
        attn_mask_type: AttnMaskType = AttnMaskType.padding,
        encoder_block_spec: ModuleSpec = None,
    ):
        super().__init__(
            config=config,
            submodules=submodules,
            layer_number=layer_number,
            attn_mask_type=attn_mask_type,
        )

        # The neighbor encoder exists only when a spec was handed in.
        self.encoder = (
            TransformerBlock(
                config=config, spec=encoder_block_spec, pre_process=True, post_process=False,
            )
            if encoder_block_spec
            else None
        )

    def forward(
        self,
        hidden_states: Tensor,
        attention_mask: Tensor,
        key_value_states: Tensor = None,
        inference_params: InferenceParams = None,
        # rotary_pos_emb: Tensor = None, # ... unsupported for retro.
    ) -> dict:
        """Run chunked cross attention for a Retro decoder layer.

        Notation:
            ns : Sequence length.
            bs : Batch size.
            d  : Hidden size.
            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
            m  : Number of tokens per chunk.
            k  : Number of neighbors.
            r  : Number of retrieved tokens (neighbors + continuation).

        Args:
            hidden_states (Tensor): Transformer layer hidden states.
            attention_mask (Tensor): Attention mask.
            key_value_states (Tensor): Neighbor embeddings if first decoder layer, else encoder output.
            inference_params (InferenceParams): Inference params.

        Returns:
            A dict with the attention output and context, plus the scalars the downstream bias-dropout-add needs.
        """
        # hidden_states:     [ ns, bs, d ]
        # key_value_states:  [ r, k*bs*l, d ]
        ns, bs, d = hidden_states.shape
        chunk_len = self.retro_chunk_length
        num_chunks = int(np.ceil(ns / chunk_len))

        def to_chunk_major(tokens: Tensor) -> Tensor:
            # [ l*m, bs, d ] -> [ m, bs*l, d ]
            split = tokens.reshape(num_chunks, chunk_len, bs, d)
            moved = split.permute(1, 2, 0, 3)
            return moved.reshape(chunk_len, bs * num_chunks, d).contiguous()

        # Encode neighbors (only in the layer that owns the encoder).
        if self.encoder:
            remainder = ns % chunk_len

            if remainder > 0:
                # The leading partial chunk is zero-padded up to a full
                # chunk, then re-joined with the remaining full chunks.
                head = hidden_states[:remainder]
                tail = hidden_states[remainder:]
                head = torch.nn.functional.pad(
                    head, (0, 0, 0, 0, 0, chunk_len - remainder), 'constant', 0,
                )
                encoder_context = torch.cat((head, tail), dim=0)  # [ l*m, bs, d ]
            else:
                # Sequence already splits into whole chunks.
                encoder_context = hidden_states  # [ l*m, bs, d ]

            encoder_context = to_chunk_major(encoder_context)  # [ m, bs*l, d ]

            # flash attn: [ b, h, sq, sk ]
            # fused attn: [ b, 1, 1, sq ]
            encoder_context_mask = get_all_true_mask(
                size=(1, 1, encoder_context.shape[0], key_value_states.shape[0]),
                device=encoder_context.device,
            )

            # From here on 'key_value_states' holds the encoded neighbors.
            encoded = self.encoder(
                hidden_states=key_value_states,
                attention_mask=attention_mask,
                context=encoder_context,
                context_mask=encoder_context_mask,
                inference_params=inference_params,
            )  # [ r, k*bs*l, d ]
            key_value_states = encoded.reshape(
                self.retro_retrieved_length * self.retro_num_neighbors, bs * num_chunks, d
            )  # [ r*k, bs*l, d ]

        # Attention starts at the last token of the first chunk.
        pad = (ns - 1) % chunk_len

        # Drop the non-attending prefix and zero-pad the end back to l*m tokens.
        attending = torch.nn.functional.pad(
            hidden_states[pad:], (0, 0, 0, 0, 0, chunk_len - 1), 'constant', 0,
        )
        queries = to_chunk_major(attending)  # [ m, bs*l, d ]

        # flash attn: [ b, h, sq, sk ]
        # fused attn: [ b, 1, 1, sq ]
        query_mask = get_all_true_mask(
            size=(1, 1, queries.shape[0], key_value_states.shape[0]),
            device=queries.device,
        )

        # Attend to encoded neighbors.
        attention_output, attention_bias = self.attn(
            hidden_states=queries,
            attention_mask=query_mask,
            key_value_states=key_value_states,
        )

        # Dimensions are forwarded for the bias-dropout-add step.
        return {
            "ns": ns,
            "bs": bs,
            "d": d,
            "l": num_chunks,
            "pad": pad,
            "attention_output": attention_output,  # [ m, bs*l, d ]
            "attention_bias": attention_bias,  # [ d ]
            "context": key_value_states,  # [ r*k, bs*l, d ]
        }
class RetroDecoderBiasDropoutAdd(MegatronModule):
    """Bias-dropout-add operator for Retro decoder layers.

    Besides the bias-dropout-add itself, this operator reshapes and permutes
    the attention output from the chunk layout back to the sequence layout.

    Args:
        config (RetroConfig): Retro config.
    """

    def __init__(
        self, config: RetroConfig,
    ):
        super().__init__(config=config)
        self.retro_chunk_length = config.retro_chunk_length

    @classmethod
    def _forward(
        cls,
        x_with_bias: dict,
        residual: Tensor,
        prob: float,
        retro_chunk_length: int,
        bias_dropout_add: Callable,
    ) -> Tensor:
        """Apply bias-dropout-add per chunk and restore the sequence layout.

        Args:
            x_with_bias (dict): Attention output and bias, along with other Retro relevant parameters.
            residual (Tensor): Transformer layer residual.
            prob (float): Dropout probability.
            retro_chunk_length (int): Retro chunk length (e.g., 64).
            bias_dropout_add (Callable): Bias-dropout-add function.

        Returns:
            Output of bias-dropout-add.
        """
        # Unpack the dict produced by the decoder cross attention.
        ns, bs, d, num_chunks, pad = (
            x_with_bias[key] for key in ("ns", "bs", "d", "l", "pad")
        )
        attention_output = x_with_bias["attention_output"]  # [ m, bs*l, d ]
        attention_bias = x_with_bias["attention_bias"]  # [ d ]

        # Re-enable torch grad to enable fused optimization.
        with torch.enable_grad():

            if attention_bias is None:
                expanded_bias = None
            else:
                expanded_bias = attention_bias.expand_as(attention_output)

            # The residual is added after un-chunking, so a zero residual is
            # passed to the bias-dropout-add function here.
            out = bias_dropout_add(
                (attention_output, expanded_bias), torch.zeros_like(attention_output), prob,
            )

            # Chunk layout -> sequence layout:
            #   [ m, bs*l, d ] -> [ m, bs, l, d ] -> [ l, m, bs, d ] -> [ m*l, bs, d ]
            out = out.reshape(retro_chunk_length, bs, num_chunks, d)
            out = out.permute(2, 0, 1, 3)
            out = out.reshape(retro_chunk_length * num_chunks, bs, d)

            # Prepend zeros for non-attending tokens, then trim to ns.
            out = torch.nn.functional.pad(out, (0, 0, 0, 0, pad, 0), 'constant', 0,)
            out = out[:ns]  # [ ns, bs, d ]

            # Add residual. [ ns, bs, d ]
            out = out + residual

        # Output. [ ns, bs, d ]
        return out

    def forward(self, training: bool, fused: bool) -> partial:
        """Build the Retro decoder bias-dropout-add callable.

        Args:
            training (bool): If training, then apply dropout.
            fused (bool): Fuse bias-dropout-add.

        Returns:
            The partial function for performing bias-dropout-add.
        """
        bda_fn = get_bias_dropout_add(training, fused)
        return partial(
            self._forward, retro_chunk_length=self.retro_chunk_length, bias_dropout_add=bda_fn,
        )
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Specs for Retro decoder."""
import typing
from megatron.core import parallel_state
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.models.gpt.gpt_layer_specs import (
get_gpt_layer_local_spec,
get_gpt_layer_with_transformer_engine_spec,
)
from megatron.core.models.retro.config import RetroConfig
from megatron.core.models.retro.decoder_attention import (
RetroDecoderBiasDropoutAdd,
RetroDecoderCrossAttention,
)
from megatron.core.models.retro.encoder_spec import get_retro_encoder_block_spec
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer import ModuleSpec
from megatron.core.transformer.attention import CrossAttentionSubmodules
try:
from megatron.core.transformer.custom_layers.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TENorm,
TERowParallelLinear,
)
except ImportError:
#print("Do not support transformer_engine")
TEColumnParallelLinear = None
TEDotProductAttention = None
TENorm = None
TERowParallelLinear = None
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.transformer_block import (
TransformerBlockSubmodules,
get_num_layers_to_build,
)
def get_retro_decoder_layer_te_spec(
    encoder_block_spec: typing.Union[ModuleSpec, TransformerBlockSubmodules, None] = None
) -> ModuleSpec:
    """Retro decoder TE spec (uses Transformer Engine components).

    A Retro decoder layer uses custom attention and bias-dropout-add operators
    to perform chunked-cross attention. Additionally, the first Retro decoder
    layer instantiates an entire encoder transformer block. As such, the decoder
    cross attention module takes an optional encoder block spec, which is only
    provided for the first Retro decoder layer.

    Args:
        encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided for the first Retro decoder layer.

    Returns:
        A module spec with Transformer Engine modules.

    Raises:
        ImportError: If the Transformer Engine layers could not be imported.
    """
    # The module-level import falls back to None when Transformer Engine is
    # missing. Fail here with a clear message instead of emitting a spec
    # whose modules are None.
    te_modules = (TEColumnParallelLinear, TEDotProductAttention, TENorm, TERowParallelLinear)
    if any(te_module is None for te_module in te_modules):
        raise ImportError(
            "Transformer Engine is not available; use the local Retro decoder layer spec instead."
        )

    spec = get_gpt_layer_with_transformer_engine_spec()
    spec.submodules.pre_cross_attn_layernorm = TENorm
    spec.submodules.cross_attention = ModuleSpec(
        module=RetroDecoderCrossAttention,
        params={"encoder_block_spec": encoder_block_spec},
        submodules=CrossAttentionSubmodules(
            linear_q=TEColumnParallelLinear,
            linear_kv=TEColumnParallelLinear,
            core_attention=TEDotProductAttention,
            linear_proj=TERowParallelLinear,
        ),
    )
    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd)
    return spec
def get_retro_decoder_layer_local_spec(
    encoder_block_spec: typing.Union[ModuleSpec, TransformerBlockSubmodules, None] = None,
) -> ModuleSpec:
    """Retro decoder local spec (uses Megatron-Core components).

    A Retro decoder layer uses custom attention and bias-dropout-add operators
    to perform chunked-cross attention. Additionally, the first Retro decoder
    layer instantiates an entire encoder transformer block. As such, the decoder
    cross attention module takes an optional encoder block spec, which is only
    provided for the first Retro decoder layer.

    The annotation accepts the same types as the Transformer Engine variant,
    since the decoder block spec passes the encoder block spec (a
    TransformerBlockSubmodules) to whichever variant is selected.

    Args:
        encoder_block_spec (ModuleSpec | TransformerBlockSubmodules | None): Retro encoder block spec, to be provided for the first Retro decoder layer.

    Returns:
        A module spec with local modules.
    """
    spec = get_gpt_layer_local_spec()
    spec.submodules.pre_cross_attn_layernorm = FusedLayerNorm
    spec.submodules.cross_attention = ModuleSpec(
        module=RetroDecoderCrossAttention,
        params={"encoder_block_spec": encoder_block_spec},
        submodules=CrossAttentionSubmodules(
            linear_q=ColumnParallelLinear,
            linear_kv=ColumnParallelLinear,
            core_attention=DotProductAttention,
            linear_proj=RowParallelLinear,
        ),
    )
    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd)
    return spec
def get_retro_decoder_block_spec(
    config: RetroConfig, use_transformer_engine: bool
) -> TransformerBlockSubmodules:
    """Retro decoder block spec.

    Retro decoder block implementation details:
    - The retro decoder block consists of interleaved GPT layers and customized Retro decoder layers.
    - The Retro decoder layers are spaced three layers apart, and start on layer 6 or 9 (depending on the total number of layers).
    - The first decoder layer instantiates an encoder block, and it therefore passes in an encoder_block_spec.

    Args:
        config (RetroConfig): Retro config.
        use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules).

    Returns:
        Transformer block submodules for the given spec.

    Raises:
        AssertionError: If pipeline or virtual pipeline parallelism is enabled.
        ValueError: If the block has layers but too few to contain any Retro decoder layer.
    """
    # Num layers.
    assert (
        parallel_state.get_pipeline_model_parallel_world_size() == 1
    ), "retro does not currently support pipeline parallelism."
    assert (
        parallel_state.get_virtual_pipeline_model_parallel_world_size() is None
    ), "retro does not currently support virtual pipeline parallelism."
    num_layers = get_num_layers_to_build(config)

    # Retro layer numbers.
    retro_layer_start = 6 if num_layers <= 15 else 9
    retro_layer_numbers = list(range(retro_layer_start, num_layers + 1, 3))

    # With 1..5 layers there is no Retro layer at all. Indexing the empty list
    # below used to surface as a bare IndexError, so report the cause instead.
    # (An empty block, num_layers == 0, still yields an empty spec list.)
    if num_layers > 0 and not retro_layer_numbers:
        raise ValueError(
            "retro decoder requires at least %d layers to place a Retro decoder layer, but only %d layer(s) are being built."
            % (retro_layer_start, num_layers)
        )
    first_retro_layer = retro_layer_numbers[0] if retro_layer_numbers else None

    # Layer specs.
    gpt_layer_spec = (
        get_gpt_layer_with_transformer_engine_spec()
        if use_transformer_engine
        else get_gpt_layer_local_spec()
    )
    get_retro_decoder_layer_spec = (
        get_retro_decoder_layer_te_spec
        if use_transformer_engine
        else get_retro_decoder_layer_local_spec
    )
    retro_layer_spec = get_retro_decoder_layer_spec()
    # Only the first Retro layer owns the neighbor encoder.
    retro_layer_spec_with_retriever = get_retro_decoder_layer_spec(
        get_retro_encoder_block_spec(config, use_transformer_engine)
    )

    layer_specs = []
    for layer_number in range(1, num_layers + 1):
        if layer_number == first_retro_layer:
            layer_specs.append(retro_layer_spec_with_retriever)
        elif layer_number in retro_layer_numbers:
            layer_specs.append(retro_layer_spec)
        else:
            layer_specs.append(gpt_layer_spec)

    # Block spec.
    block_spec = TransformerBlockSubmodules(layer_specs=layer_specs)
    return block_spec
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Retro's cross attention modules for the encoder block."""
from functools import partial
from typing import Callable, List, Optional, Tuple, Type
import torch
from torch import Tensor
from megatron.core import InferenceParams
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.retro.base_attention import BaseRetroCrossAttention
from megatron.core.models.retro.config import RetroConfig
from megatron.core.models.retro.utils import get_all_true_mask
from megatron.core.transformer.module import MegatronModule
class RetroEncoderCrossAttention(BaseRetroCrossAttention):
    """Cross attention operator for Retro encoder layers.

    Paper: https://arxiv.org/abs/2112.04426.

    Neighbor chunks retrieved from the chunk database are encoded here; the
    decoder layers later use the result for chunked cross attention.

    Args:
        config (RetroConfig): Retro config.
        submodules (CrossAttentionSubmodules): Cross attention submodules.
        layer_number (int): Layer number within transformer block.
        attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
    """

    def forward(
        self,
        hidden_states: Tensor,
        attention_mask: Tensor,
        key_value_states: Tensor = None,
        inference_params: InferenceParams = None,
        # rotary_pos_emb: Tensor = None, # unsupported for retro.
    ) -> List[Tuple[Tensor, Optional[Tensor], Tensor]]:
        """Run cross attention separately for every neighbor.

        Notation:
            ns : Sequence length.
            bs : Batch size.
            d  : Hidden size.
            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
            k  : Number of neighbors.
            r  : Number of retrieved tokens (neighbors + continuation).

        Args:
            hidden_states (Tensor): Transformer layer hidden states.
            attention_mask (Tensor): Attention mask.
            key_value_states (Tensor): Neighbor embeddings.
            inference_params (InferenceParams): Inference params.

        Returns:
            List of tuples, where each tuple is (attention_output, attention_bias, residual).
        """
        # Input shape. [ r, bs*l*k, d ]; only the hidden size is needed.
        _, _, d = hidden_states.shape

        # Expose the neighbor axis: [ r, bs*l*k, d ] -> [ r, bs*l, k, d ]
        per_neighbor = hidden_states.reshape(
            self.retro_retrieved_length, -1, self.retro_num_neighbors, d
        )

        # flash attn: [ b, h, sq, sk ]
        # fused attn: [ b, 1, 1, sq ]
        all_true_mask = get_all_true_mask(
            size=(1, 1, per_neighbor.shape[0], key_value_states.shape[0]),
            device=per_neighbor.device,
        )

        def attend(neighbor_idx: int) -> Tuple[Tensor, Optional[Tensor], Tensor]:
            # - queries:           [ r, bs*l, d ]
            # - key_value_states:  [ m, bs*l, d ]
            # - attention_output:  [ r, bs*l, d ]
            # - attention_bias:    [ d ]
            queries = per_neighbor[:, :, neighbor_idx].contiguous()
            attention_output, attention_bias = self.attn(
                hidden_states=queries,  # Q (neighbor embedding)
                attention_mask=all_true_mask,
                key_value_states=key_value_states,  # K, V (hidden act)
            )
            # The queries double as the residual. [ r, bs*l, d ]
            return (attention_output, attention_bias, queries)

        # Output. (List[Tuple[( [ r, bs*l, d ], [ d ] )]])
        return [attend(neighbor_idx) for neighbor_idx in range(self.retro_num_neighbors)]
class RetroEncoderBiasDropoutAdd(MegatronModule):
    """Bias-dropout-add operator for Retro encoder layers.

    Bias-dropout-add is applied separately to each neighbor chunk retrieved
    from the chunk database, and the results are joined into one tensor.

    Args:
        config (RetroConfig): Retro config.
    """

    def __init__(
        self, config: RetroConfig,
    ):
        super().__init__(config=config)
        self.retro_num_neighbors = config.retro_num_neighbors

    @classmethod
    def _forward(
        cls,
        x_with_bias: List[Tuple[Tensor, Optional[Tensor], Tensor]],
        residual: Tensor,
        prob: float,
        retro_num_neighbors: int,
        bias_dropout_add: Callable,
    ) -> Tensor:
        """Apply bias-dropout-add to each neighbor and join the results.

        Args:
            x_with_bias (list): One (attention_output, attention_bias, residual) tuple per neighbor.
            residual (Tensor): Transformer layer residual. Not read here; each tuple carries its own residual.
            prob (float): Dropout probability.
            retro_num_neighbors (int): Number of retrieved neighbor chunks (e.g., 2). Not read here.
            bias_dropout_add (Callable): Bias-dropout-add function.

        Returns:
            Output of bias-dropout-add.
        """
        per_neighbor_outputs = []

        # Re-enable torch grad to enable fused optimization.
        with torch.enable_grad():

            # - attention_output: [ r, bs*l, d ]
            # - attention_bias:   [ d ]
            # - chunk_residual:   [ r, bs*l, d ]
            # - result:           [ r, bs*l, d ]
            for attention_output, attention_bias, chunk_residual in x_with_bias:
                if attention_bias is None:
                    expanded_bias = None
                else:
                    expanded_bias = attention_bias.expand_as(chunk_residual)
                per_neighbor_outputs.append(
                    bias_dropout_add((attention_output, expanded_bias), chunk_residual, prob)
                )

        # Join the neighbors along dim 1, giving [ r, k*bs*l, d ].
        r, _, d = per_neighbor_outputs[0].shape
        stacked = torch.stack(per_neighbor_outputs, dim=1)

        # Output. [ r, k*bs*l, d ]
        return stacked.reshape(r, -1, d)

    def forward(self, training: bool, fused: bool) -> partial:
        """Build the Retro encoder bias-dropout-add callable.

        Args:
            training (bool): If training, then apply dropout.
            fused (bool): Fuse bias-dropout-add.

        Returns:
            A partial function for performing bias-dropout-add.
        """
        bda_fn = get_bias_dropout_add(training, fused)
        return partial(
            self._forward, retro_num_neighbors=self.retro_num_neighbors, bias_dropout_add=bda_fn,
        )
class RetroEncoderLayerNorm(MegatronModule):
    """Layernorm operator for Retro encoder layers.

    The input is split into one tensor per neighbor, the norm is applied to
    each piece separately, and the pieces are joined back into one tensor.

    Args:
        config (RetroConfig): Retro config.
        submodules (Type): Layer norm class. (Named 'submodules' to fit external interface.)
    """

    def __init__(
        self, config: RetroConfig, submodules: Type, **kwargs: dict,
    ):
        super().__init__(config=config)
        # 'submodules' is the norm class itself; instantiate it directly.
        self.norm = submodules(config=config, **kwargs)
        self.retro_num_neighbors = config.retro_num_neighbors

    def forward(self, input: Tensor) -> Tensor:
        """Apply the layer norm to each neighbor's slice of the input.

        Args:
            input (Tensor): Input chunks, concatenated into a single tensor.

        Returns:
            Output of the layer norm.
        """
        # Input shape: [ r, k*bs*l, d ]. (see notation above in attention module)

        # One slice of dim 1 per neighbor.
        per_neighbor_size = input.shape[1] // self.retro_num_neighbors
        slices = torch.split(input, per_neighbor_size, dim=1)

        # Norm.
        normed = []
        for piece in slices:
            normed.append(self.norm(piece.contiguous()))

        # Join the normed slices back to shape [ r, k*bs*l, d ].
        r, _, d = slices[0].shape
        stacked = torch.stack(normed, dim=1)

        # Output. [ r, k*bs*l, d ]
        return stacked.reshape(r, -1, d)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Specs for Retro encoder."""
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.models.gpt.gpt_layer_specs import (
get_gpt_layer_local_spec,
get_gpt_layer_with_transformer_engine_spec,
)
from megatron.core.models.retro.config import RetroConfig
from megatron.core.models.retro.encoder_attention import (
RetroEncoderBiasDropoutAdd,
RetroEncoderCrossAttention,
RetroEncoderLayerNorm,
)
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer import ModuleSpec
from megatron.core.transformer.attention import CrossAttentionSubmodules
try:
from megatron.core.transformer.custom_layers.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TENorm,
TERowParallelLinear,
)
except ImportError:
TEColumnParallelLinear = None
TEDotProductAttention = None
TENorm = None
TERowParallelLinear = None
#print("Do not support transformer_engine")
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
def get_retro_encoder_layer_te_spec() -> ModuleSpec:
    """Retro encoder TE spec (uses Transformer Engine components).

    A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm
    operators to encode neighboring chunks that are retrieved from the chunk
    database. Each operator is responsible for iterating the retrieved chunks
    and processing them individually.

    Returns:
        A module spec with Transformer Engine modules.

    Raises:
        ImportError: If the Transformer Engine layers could not be imported.
    """
    # The module-level import falls back to None when Transformer Engine is
    # missing. Fail here with a clear message instead of emitting a spec
    # whose modules are None.
    te_modules = (TEColumnParallelLinear, TEDotProductAttention, TENorm, TERowParallelLinear)
    if any(te_module is None for te_module in te_modules):
        raise ImportError(
            "Transformer Engine is not available; use the local Retro encoder layer spec instead."
        )

    spec = get_gpt_layer_with_transformer_engine_spec()
    spec.submodules.pre_cross_attn_layernorm = TENorm
    spec.submodules.cross_attention = ModuleSpec(
        module=RetroEncoderCrossAttention,
        params={"attn_mask_type": AttnMaskType.padding},
        submodules=CrossAttentionSubmodules(
            linear_q=TEColumnParallelLinear,
            linear_kv=TEColumnParallelLinear,
            core_attention=TEDotProductAttention,
            linear_proj=TERowParallelLinear,
        ),
    )
    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd)
    spec.submodules.pre_mlp_layernorm = ModuleSpec(module=RetroEncoderLayerNorm, submodules=TENorm)
    spec.submodules.mlp = ModuleSpec(
        module=MLP,
        submodules=MLPSubmodules(
            linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,
        ),
    )
    return spec
def get_retro_encoder_layer_local_spec() -> ModuleSpec:
    """Build the Retro encoder layer spec from local (Megatron-Core) modules.

    Starting from the local GPT layer spec, the cross attention,
    bias-dropout-add, pre-MLP layernorm and MLP entries are replaced with the
    Retro encoder operators, which process the retrieved neighbor chunks
    individually.

    Returns:
        A module spec with local modules.
    """
    layer_spec = get_gpt_layer_local_spec()
    parts = layer_spec.submodules

    # Cross attention path.
    parts.pre_cross_attn_layernorm = FusedLayerNorm
    cross_attention_parts = CrossAttentionSubmodules(
        linear_q=ColumnParallelLinear,
        linear_kv=ColumnParallelLinear,
        core_attention=DotProductAttention,
        linear_proj=RowParallelLinear,
    )
    parts.cross_attention = ModuleSpec(
        module=RetroEncoderCrossAttention,
        params={"attn_mask_type": AttnMaskType.padding},
        submodules=cross_attention_parts,
    )
    parts.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd)

    # MLP path.
    parts.pre_mlp_layernorm = ModuleSpec(
        module=RetroEncoderLayerNorm, submodules=FusedLayerNorm,
    )
    mlp_parts = MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear)
    parts.mlp = ModuleSpec(module=MLP, submodules=mlp_parts)

    # pre_mlp_layernorm doesn't need remapping
    parts.sharded_state_dict_keys_map = {
        'input_layernorm.': 'self_attention.linear_qkv.layer_norm_',
    }
    return layer_spec
def get_retro_encoder_block_spec(
    config: RetroConfig, use_transformer_engine: bool
) -> TransformerBlockSubmodules:
    """Build the Retro encoder block spec.

    Layer 1 of the encoder block is a customized Retro encoder layer; every
    following layer is a standard GPT layer.

    Args:
        config (RetroConfig): Retro config.
        use_transformer_engine (bool): If True, use Transformer Engine (instead of local modules).

    Returns:
        Transformer block submodules for the given spec.
    """
    # Pick the Transformer Engine or the local flavor of each ingredient.
    if use_transformer_engine:
        gpt_layer_spec = get_gpt_layer_with_transformer_engine_spec()
        retro_layer_spec = get_retro_encoder_layer_te_spec()
        core_attention_module = TEDotProductAttention
    else:
        gpt_layer_spec = get_gpt_layer_local_spec()
        retro_layer_spec = get_retro_encoder_layer_local_spec()
        core_attention_module = DotProductAttention

    # Both layer kinds receive the encoder-specific dropout settings and a
    # padding mask for self attention.
    for layer_spec in (gpt_layer_spec, retro_layer_spec):
        self_attention = layer_spec.submodules.self_attention
        layer_spec.params["hidden_dropout"] = config.retro_encoder_hidden_dropout
        self_attention.params["attn_mask_type"] = AttnMaskType.padding
        self_attention.submodules.core_attention = ModuleSpec(
            module=core_attention_module,
            params={"attention_dropout": config.retro_encoder_attention_dropout},
        )

    # Only layer 1 is a Retro layer.
    retro_layer_numbers = [1]
    layer_specs = [
        retro_layer_spec if layer_number in retro_layer_numbers else gpt_layer_spec
        for layer_number in range(1, config.retro_encoder_num_layers + 1)
    ]

    # Block spec.
    return TransformerBlockSubmodules(layer_specs=layer_specs)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Retro Model."""
from typing import Dict, Optional
from torch import Tensor
from megatron.core import InferenceParams
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.models.gpt import GPTModel
class RetroModel(GPTModel):
    """Retro Model.

    A Retro model mostly re-uses the GPTModel interface. The only difference
    is the embedding of the 'context', which Retro uses for processing
    neighbor tokens. This embedded context is then forwarded to the
    Transformer Block.
    """

    def forward(
        self,
        input_ids: Tensor,
        position_ids: Tensor,
        attention_mask: Tensor,
        context_input_ids: Optional[Tensor] = None,
        context_position_ids: Optional[Tensor] = None,
        context_mask: Optional[Tensor] = None,
        decoder_input: Optional[Tensor] = None,
        labels: Optional[Tensor] = None,
        inference_params: Optional[InferenceParams] = None,
    ) -> Tensor:
        """RetroModel forward method.

        Forward input tokens & mask, along with neighbor tokens & mask,
        through the Retro model.

        Args:
            input_ids (Tensor): Input token IDs.
            position_ids (Tensor): Input position IDs.
            attention_mask (Tensor): Input attention mask.
            context_input_ids (Tensor): Context (i.e., neighbor) token IDs.
            context_position_ids (Tensor): Context (i.e., neighbor) position IDs.
            context_mask (Tensor): Context (i.e., neighbor) attention mask.
            decoder_input (Tensor): When using pipeline parallelism, input_ids
                and position_ids will only be used on the first stage, and for
                all other stages decoder_input will be provided via
                communication from the previous stage.
            labels (Tensor): The labels of dimension [batch size, seq length].
            inference_params (InferenceParams): Parameters for inference.

        Returns:
            Output tensor of forward pass.
        """

        # Argument shapes:
        #   Notation:
        #     ns : Sequence length.
        #     bs : Batch size.
        #     d  : Hidden size.
        #     l  : Number of chunks per sample (i.e., seq_length/chunk_length).
        #     k  : Number of neighbors.
        #     r  : Number of retrieved tokens (neighbors + continuation).
        #   - input_ids:   [ bs, ns ]
        #   - context_ids: [ k*bs*l, r ]
        #   - context:     [ r, k*bs*l, d ]
        #   - output:      [ ns, bs, d ]

        # Context embedding (e.g., for Retro neighbor tokens). The context is
        # embedded with the same embedding module as the input tokens; when no
        # context IDs are supplied, None is passed through to the block.
        if context_input_ids is not None:
            context = self.embedding(context_input_ids, context_position_ids)
        else:
            context = None

        # Call GPTModel.forward, and pass in embedded context.
        return super().forward(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            decoder_input=decoder_input,
            labels=labels,
            inference_params=inference_params,
            extra_block_kwargs={"context": context, "context_mask": context_mask},
        )

    def sharded_state_dict(
        self, prefix: str = '', sharded_offsets: tuple = (), metadata: Optional[Dict] = None
    ) -> ShardedStateDict:
        """Get sharded state dict.

        The 'non_homogeneous_layers' metadata flag is always set to True
        before delegating to the parent implementation.

        Args:
            prefix (str): Module name prefix.
            sharded_offsets (tuple): Offsets of local shard within global tensor.
            metadata (Optional[Dict]): Shard metadata. The passed dict is not
                modified; a shallow copy carrying the extra flag is forwarded.

        Returns:
            The sharded state dict produced by the parent class.
        """
        # Work on a copy so the flag is never written into the caller's dict
        # (previously a non-empty dict was mutated while an empty one was not).
        metadata = dict(metadata) if metadata else {}
        metadata['non_homogeneous_layers'] = True
        return super().sharded_state_dict(prefix, sharded_offsets, metadata)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import os
import torch
def get_config_path(project_dir: str) -> str:
    """Return the path of the config copy kept inside the Retro project dir.

    Args:
        project_dir (str): Retro project directory.

    Returns:
        Path to 'config.json' directly under `project_dir`.
    """
    config_filename = "config.json"
    config_path = os.path.join(project_dir, config_filename)
    return config_path
def get_gpt_data_dir(project_dir: str) -> str:
    """Return the project-relative directory of the GPT bin/idx datasets.

    Args:
        project_dir (str): Retro project directory.

    Returns:
        Path to the 'data' directory directly under `project_dir`.
    """
    data_dirname = "data"
    data_dir = os.path.join(project_dir, data_dirname)
    return data_dir
# ** Note ** : Retro's compatibility between cross attention and Flash/Fused
# Attention is currently a work in progress. We default to returning None for
# now.
# def get_all_true_mask(size, device):
# return torch.full(size=size, fill_value=True, dtype=torch.bool, device=device)
def get_all_true_mask(size, device):
    """Placeholder for an all-True mask; currently always returns None.

    Retro's compatibility between cross attention and Flash/Fused Attention
    is still a work in progress, so no mask tensor is built for now.

    Args:
        size: Requested mask size (currently unused).
        device: Requested mask device (currently unused).

    Returns:
        None.
    """
    mask = None
    return mask
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from typing import Optional, Union
import torch
from megatron.core.models.common.vision_module.vision_module import VisionModule
from megatron.core.transformer.custom_layers.transformer_engine import TENorm
from megatron.core.transformer.enums import ModelType
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.transformer.transformer_config import TransformerConfig
# Note: This is under development and is missing features like position embedding interpolation.
class CLIPViTModel(VisionModule):
    """CLIP ViT vision model.

    Args:
        transformer_config (TransformerConfig): Transformer config.
        transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers.
        ln_pre_impl (ModuleSpec or type): Specifies the layer norm type to use for ln_pre.
        patch_dim (int): Image patch size.
        img_h (int): Input image height. Must be divisible by `patch_dim`.
        img_w (int): Input image width. Must be divisible by `patch_dim`.
        add_class_token (bool, optional): Include a class token. Defaults to True.
        class_token_len (int): Class token length. Defaults to 1 but 8 may be faster.
    """

    def __init__(
        self,
        transformer_config: TransformerConfig,
        transformer_layer_spec: ModuleSpec,
        ln_pre_impl: Union[ModuleSpec, type] = TENorm,
        patch_dim: int = 14,
        img_h: int = 336,
        img_w: int = 336,
        add_class_token: bool = True,
        class_token_len: int = 1,
    ) -> None:
        super().__init__(config=transformer_config)

        self.visual_hidden_size = transformer_config.hidden_size
        self.patch_dim = patch_dim
        self.img_h = img_h
        self.img_w = img_w

        # The image is cut into non-overlapping patch_dim x patch_dim patches,
        # so both image dimensions must be exact multiples of the patch size.
        assert (
            self.img_h % self.patch_dim == 0
        ), f"img_h ({self.img_h}) must be divisible by patch_dim ({self.patch_dim})"
        assert (
            self.img_w % self.patch_dim == 0
        ), f"img_w ({self.img_w}) must be divisible by patch_dim ({self.patch_dim})"
        self.num_patches_per_dim_h = self.img_h // self.patch_dim
        self.num_patches_per_dim_w = self.img_w // self.patch_dim
        self.num_patches = self.num_patches_per_dim_h * self.num_patches_per_dim_w

        self.add_class_token = add_class_token
        self.class_token_len = class_token_len

        # Sequence length = number of patches, plus the class token(s) if enabled.
        self.seq_length = self.num_patches + (self.class_token_len if self.add_class_token else 0)

        # Patch embedding: a strided convolution whose kernel and stride both
        # equal the patch size, applied to 3-channel input.
        self.conv1 = torch.nn.Conv2d(
            in_channels=3,
            out_channels=self.visual_hidden_size,
            kernel_size=self.patch_dim,
            stride=self.patch_dim,
            bias=False,
        )

        # Position IDs of shape [1, seq_length]; stored as a plain attribute
        # placed on the current CUDA device.
        self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda()

        self.position_embeddings = torch.nn.Embedding(self.seq_length, self.visual_hidden_size)

        if self.add_class_token:
            self.class_token = torch.nn.Parameter(
                torch.randn(1, self.class_token_len, self.visual_hidden_size)
            )

        self.ln_pre = build_module(
            ln_pre_impl,
            config=transformer_config,
            hidden_size=self.visual_hidden_size,
            eps=transformer_config.layernorm_epsilon,
        )

        self.model_type = ModelType.encoder_or_decoder

        # Transformer layers.
        # TODO: Follow-up changes will make pre and post_process configurable.
        # They are needed for supporting pipeline parallelism.
        # Note: a final layer norm and/or linear layer present in some
        # implementations are omitted here. They can be added separately where needed.
        self.decoder = TransformerBlock(
            config=transformer_config,
            spec=transformer_layer_spec,
            pre_process=True,
            post_process=False,
        )

    def set_input_tensor(self, input_tensor: torch.Tensor) -> None:
        """Sets input tensor to the model.

        The tensor is forwarded to the underlying transformer block.

        Args:
            input_tensor (Tensor): Sets the input tensor for the model.
        """
        self.decoder.set_input_tensor(input_tensor)

    def forward(
        self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Forward function of the CLIP ViT Model. This function passes the input tensors
        through the embedding layer and then the transformer.

        Args:
            x (torch.Tensor): input data of shape [batch, 3, img_h, img_w]
                (the patch convolution expects 3 input channels).
            attention_mask (torch.Tensor with dtype=bool): Attention mask to use.
                If None, an all-False boolean mask of shape [1, 1, s, s] is used.

        Returns:
            x (torch.Tensor): output after final transformer block of shape [b, s, h].
        """
        x = self.conv1(x)  # shape = [batch, hidden_size, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # [batch, hidden_size, grid ** 2]
        x = x.permute(0, 2, 1)  # [batch, grid ** 2, hidden_size]

        if self.add_class_token:
            class_token = self.class_token.expand(
                x.shape[0], -1, -1
            )  # [batch, class_token_len, hidden_size]
            x = torch.cat(
                [class_token, x], dim=1
            )  # [batch, grid ** 2 + class_token_len, hidden_size]

        x = x + self.position_embeddings(self.position_ids)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # [b, s, h] -> [s, b, h]
        if attention_mask is None:
            # Equivalent to `torch.ones(...) < 0.5` (all False), but built
            # directly as bool and on the same device as the activations.
            seq_len = x.shape[0]
            attention_mask = torch.zeros(
                1, 1, seq_len, seq_len, dtype=torch.bool, device=x.device
            )  # [1, 1, s, s]

        x = self.decoder(x.contiguous(), attention_mask)
        x = x.permute(1, 0, 2)  # [s, b, h] -> [b, s, h]
        x = x.contiguous()

        return x
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment