# coding=utf-8
# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Mistral model."""
import inspect
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from .activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from transformers.configuration_utils import PretrainedConfig
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
try:
from .flash_self_attn import compute_flash_attention
except ImportError:
compute_flash_attention = None
class MistralConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate a
Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1.
[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
[mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`MistralModel`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 14336):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*, defaults to 8):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to `8`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
The maximum sequence length that this model might ever be used with. Mistral's sliding window attention
allows sequences of up to 4096*32 tokens.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
The id of the padding token.
bos_token_id (`int`, *optional*, defaults to 1):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 2):
The id of the "end-of-sequence" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
sliding_window (`int`, *optional*, defaults to 4096):
Sliding window attention window size. If not specified, will default to `4096`.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
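use_ft_flash_attn (`bool`, *optional*, defaults to `False`):
Whether to use the fused flash-attention kernel from the optional local `flash_self_attn` module. The fused
path is only taken when this flag is set and the kernel can be imported; otherwise the standard attention
implementation is used.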
```python
>>> from transformers import MistralModel, MistralConfig
>>> # Initializing a Mistral 7B style configuration
>>> configuration = MistralConfig()
>>> # Initializing a model from the Mistral 7B style configuration
>>> model = MistralModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "mistral"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=14336,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=8,
hidden_act="silu",
max_position_embeddings=4096 * 32,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
rope_theta=10000.0,
sliding_window=4096,
attention_dropout=0.0,
use_ft_flash_attn=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.sliding_window = sliding_window
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.attention_dropout = attention_dropout
self.use_ft_flash_attn = use_ft_flash_attn
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MistralConfig"
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
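# Illustrative sketch (not part of the original file): for an attention_mask of
#   [[1, 1, 1, 0],
#    [1, 1, 0, 0]]
# this returns indices=[0, 1, 2, 4, 5], cu_seqlens=[0, 3, 5] and max_seqlen_in_batch=3,
# i.e. the packed (varlen) layout expected by flash-attn style kernels.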
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
class MistralRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
MistralRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
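# RMSNorm: y = weight * x / sqrt(mean(x**2, dim=-1) + eps); the statistics are computed in float32 for stability.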
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
# TODO @Arthur no longer copied from Llama after static cache
class MistralRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
freqs = torch.outer(t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
# TODO @Arthur no longer copied from Llama after static cache
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`):
The position indices of the tokens corresponding to the query and key tensors. For example, this can be
used to pass offsetted position ids when working with a KV-cache.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
class MistralMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
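# Gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x)), i.e. SwiGLU when hidden_act == "silu".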
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
class MistralAttention(nn.Module):
"""
Multi-headed attention from the 'Attention Is All You Need' paper, modified to use sliding-window attention as in
Longformer and "Generating Long Sequences with Sparse Transformers".
"""
def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.is_causal = True
self.attention_dropout = config.attention_dropout
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self.rotary_emb = MistralRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
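# Take the fused flash-attention path only when the config opts in and the optional import above succeeded.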
self.use_ft_flash_attn = (
config.use_ft_flash_attn and compute_flash_attention is not None
)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
cu_input_lens: Optional[torch.Tensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
if not self.use_ft_flash_attn:
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
else:
assert output_attentions in (None, False), "output_attentions is not supported by the fused flash-attention path"
# (batch, head, seq_length, head_features) -> (batch, seq_length, head, head_features)
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
if attention_mask is not None:
# [bsz, 1, tgt_seq_len, src_seq_len] -> [bsz, seq_len]
attention_mask = attention_mask[:, 0, -1].contiguous()
out_dtype = value_states.dtype
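# NOTE (assumption): `compute_flash_attention` comes from the local `flash_self_attn` module and is expected to
# take (q, k, v) in (batch, seq, heads, head_dim) layout together with either a per-token padding mask or packed
# `cu_input_lens`; the trailing `True` and `self.training` arguments presumably select causal masking and
# training-mode behaviour.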
attn_output = compute_flash_attention(
query_states,
key_states,
value_states,
attention_mask,
cu_input_lens,
True,
self.training,
)
attn_output = attn_output.to(out_dtype)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class MistralDecoderLayer(nn.Module):
def __init__(self, config: MistralConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = MistralAttention(config, layer_idx)
self.mlp = MistralMLP(config)
self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
cu_input_lens: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, sequence_length)` where padding elements are indicated by 0.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
cu_input_lens=cu_input_lens,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
MISTRAL_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MistralConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Mistral Model outputting raw hidden-states without any specific head on top.",
MISTRAL_START_DOCSTRING,
)
class MistralPreTrainedModel(PreTrainedModel):
config_class = MistralConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["MistralDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
MISTRAL_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
Two formats are allowed:
- a [`~cache_utils.Cache`] instance;
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
cache format.
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
legacy cache format will be returned.
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Mistral Model outputting raw hidden-states without any specific head on top.",
MISTRAL_START_DOCSTRING,
)
class MistralModel(MistralPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
Args:
config: MistralConfig
"""
def __init__(self, config: MistralConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList(
[MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
cu_input_lens: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
past_key_values_length = 0
if use_cache:
use_legacy_cache = not isinstance(past_key_values, Cache)
if use_legacy_cache:
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
past_key_values_length = past_key_values.get_usable_length(seq_length)
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
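# When sequences are packed via `cu_input_lens`, the dense 4D causal mask below is skipped; causal masking is
# then handled inside `MistralAttention` by the flash-attention path.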
if cu_input_lens is None:
attention_mask = _prepare_4d_causal_attention_mask(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
sliding_window=self.config.sliding_window,
)
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
attention_mask,
position_ids,
cu_input_lens,
past_key_values,
output_attentions,
use_cache,
use_reentrant=False
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
cu_input_lens=cu_input_lens,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = None
if use_cache:
next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class MistralForCausalLM(MistralPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = MistralModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
@add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
cu_input_lens: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example:
```python
>>> from transformers import AutoTokenizer, MistralForCausalLM
>>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cu_input_lens=cu_input_lens,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states)
logits = logits.float()
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Ensure tensors are on the same device
shift_labels = shift_labels.to(shift_logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
# Omit tokens covered by past_key_values
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
# If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1]:]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@add_start_docstrings(
"""
The Mistral Model transformer with a sequence classification head on top (linear layer).
[`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-2) do.
Since it does classification on the last token, it needs to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
MISTRAL_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL
class MistralForSequenceClassification(MistralPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = MistralModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
@add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.model(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
logits = self.score(hidden_states)
if input_ids is not None:
batch_size = input_ids.shape[0]
else:
batch_size = inputs_embeds.shape[0]
if self.config.pad_token_id is None and batch_size != 1:
raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
# if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
sequence_lengths = sequence_lengths.to(logits.device)
else:
sequence_lengths = -1
pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(pooled_logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(pooled_logits, labels)
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
# Copyright 2024 The HuggingFace Team. All rights reserved.
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Optional, Tuple
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import is_torch_available, logging
logger = logging.get_logger(__name__)
if is_torch_available():
import torch
def _compute_default_rope_parameters(
config: Optional[PretrainedConfig] = None,
device: Optional["torch.device"] = None,
seq_len: Optional[int] = None,
**rope_kwargs,
) -> Tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies according to the original RoPE implementation
Args:
config ([`~transformers.PretrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
"""
if config is not None and len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
)
if len(rope_kwargs) > 0:
base = rope_kwargs["base"]
dim = rope_kwargs["dim"]
elif config is not None:
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
attention_factor = 1.0 # Unused in this type of RoPE
# Compute the inverse frequencies
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
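# Example (sketch, not from the original file): with hidden_size=4096, num_attention_heads=32 and
# rope_theta=10000.0, dim == 128 and `inv_freq` holds the 64 values 10000 ** (-2*i / 128) for i in range(64).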
return inv_freq, attention_factor
def _compute_linear_scaling_rope_parameters(
config: Optional[PretrainedConfig] = None,
device: Optional["torch.device"] = None,
seq_len: Optional[int] = None,
**rope_kwargs,
) -> Tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
Args:
config ([`~transformers.PretrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
"""
if config is not None and len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
)
if len(rope_kwargs) > 0:
factor = rope_kwargs["factor"]
elif config is not None:
factor = config.rope_scaling["factor"]
# Gets the default RoPE parameters
inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
# Then applies linear scaling to the frequencies.
# NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
# applying scaling to the inverse frequencies is equivalent.
inv_freq /= factor
return inv_freq, attention_factor
def _compute_dynamic_ntk_parameters(
config: Optional[PretrainedConfig] = None,
device: Optional["torch.device"] = None,
seq_len: Optional[int] = None,
**rope_kwargs,
) -> Tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
Args:
config ([`~transformers.PretrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length, used to update the dynamic RoPE at inference time.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
"""
# TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
if config is not None and len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
)
if len(rope_kwargs) > 0:
base = rope_kwargs["base"]
dim = rope_kwargs["dim"]
max_position_embeddings = rope_kwargs["max_position_embeddings"]
factor = rope_kwargs["factor"]
elif config is not None:
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
max_position_embeddings = config.max_position_embeddings
factor = config.rope_scaling["factor"]
attention_factor = 1.0 # Unused in this type of RoPE
# seq_len: default to max_position_embeddings, e.g. at init time
seq_len = seq_len if seq_len is not None else max_position_embeddings
# Compute the inverse frequencies
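# At seq_len == max_position_embeddings the scaling term below equals 1 (base unchanged); for longer sequences
# the effective base grows, stretching the RoPE wavelengths.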
base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
return inv_freq, attention_factor
def _compute_yarn_parameters(
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with YaRN (NTK-aware) scaling. Please refer to the
[original paper](https://arxiv.org/abs/2309.00071)
Args:
config ([`~transformers.PretrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin.
"""
# No need to keep BC with yarn, unreleased when this new pattern was created.
if len(rope_kwargs) > 0:
raise ValueError(
f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
)
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
max_position_embeddings = config.max_position_embeddings
factor = config.rope_scaling["factor"]
# Sets the attention factor as suggested in the paper
attention_factor = config.rope_scaling.get("attention_factor")
if attention_factor is None:
attention_factor = 0.1 * math.log(factor) + 1.0
# Optional config options
# beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
beta_fast = config.rope_scaling.get("beta_fast") or 32
beta_slow = config.rope_scaling.get("beta_slow") or 1
# Compute the inverse frequencies
def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
"""Inverse dimension formula to find the dimension based on the number of rotations"""
return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
"""Find dimension range bounds based on rotations"""
low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
return max(low, 0), min(high, dim - 1)
def linear_ramp_mask(min_val, max_val, dim):
if min_val == max_val:
max_val += 0.001 # Prevent singularity
linear_func = (torch.arange(dim, dtype=torch.float32) - min_val) / (max_val - min_val)
ramp_func = torch.clamp(linear_func, 0, 1)
return ramp_func
pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
inv_freq_extrapolation = 1.0 / pos_freqs
inv_freq_interpolation = 1.0 / (factor * pos_freqs)
low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
# Get n-dimensional rotational scaling corrected for extrapolation
inv_freq_mask = 1 - linear_ramp_mask(low, high, dim // 2).float().to(device)
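# Blend: high-frequency dims (mask == 1) keep the original (extrapolated) frequencies, low-frequency dims
# (mask == 0) use the interpolated frequencies divided by `factor`, and the linear ramp mixes the band in between.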
inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
return inv_freq, attention_factor
def _compute_longrope_parameters(
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies with LongRoPE scaling. Please refer to the
[original implementation](https://github.com/microsoft/LongRoPE)
Args:
config ([`~transformers.PretrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin.
"""
# TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
# No need to keep BC with longrope, unreleased when this new pattern was created.
if len(rope_kwargs) > 0:
raise ValueError(
"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
f"{rope_kwargs}"
)
base = config.rope_theta
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
long_factor = config.rope_scaling["long_factor"]
short_factor = config.rope_scaling["short_factor"]
factor = config.rope_scaling.get("factor")
attention_factor = config.rope_scaling.get("attention_factor")
# NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
# `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
# values to compute the default attention scaling factor, instead of using `factor`.
if hasattr(config, "original_max_position_embeddings"):
max_position_embeddings = config.original_max_position_embeddings
expanded_max_position_embeddings = config.max_position_embeddings
factor = expanded_max_position_embeddings / max_position_embeddings
else:
max_position_embeddings = config.max_position_embeddings
expanded_max_position_embeddings = max_position_embeddings * factor
# Sets the attention factor as suggested in the paper
if attention_factor is None:
if factor <= 1.0:
attention_factor = 1.0
else:
attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
# Compute the inverse frequencies -- scaled based on the target sequence length
if expanded_max_position_embeddings > max_position_embeddings:
ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
else:
ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
return inv_freq, attention_factor
def _compute_llama3_parameters(
config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies for llama 3.1.
Args:
config ([`~transformers.PretrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
rope_kwargs (`Dict`, *optional*):
BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin.
"""
# Gets the default RoPE parameters
inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
factor = config.rope_scaling["factor"] # `8` in the original implementation
low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation
high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation
old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
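# Band split: frequencies whose wavelength is shorter than `high_freq_wavelen` are kept as-is, those longer than
# `low_freq_wavelen` are divided by `factor`, and wavelengths in between are smoothly interpolated below.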
new_freqs = []
for freq in inv_freq:
wavelen = 2 * math.pi / freq
if wavelen < high_freq_wavelen:
new_freqs.append(freq)
elif wavelen > low_freq_wavelen:
new_freqs.append(freq / factor)
else:
assert low_freq_wavelen != high_freq_wavelen
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
new_freqs.append((1 - smooth) * freq / factor + smooth * freq)
inv_freq = torch.tensor(new_freqs, dtype=inv_freq.dtype, device=inv_freq.device)
return inv_freq, attention_factor
# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
ROPE_INIT_FUNCTIONS = {
"default": _compute_default_rope_parameters,
"linear": _compute_linear_scaling_rope_parameters,
"dynamic": _compute_dynamic_ntk_parameters,
"yarn": _compute_yarn_parameters,
"longrope": _compute_longrope_parameters,
"llama3": _compute_llama3_parameters,
}
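# Illustrative sketch (not part of the original mapping): registering a hypothetical custom
# RoPE parameterization. The callable must keep the signature used by the entries above and
# return (inv_freq, attention_factor); here we simply rescale the default frequencies.
def _compute_halved_rope_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
    return inv_freq / 2.0, attention_factor
# ROPE_INIT_FUNCTIONS["halved"] = _compute_halved_rope_parameters  # enable if needed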
def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None):
"""Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
# BC: "rope_type" was originally "type" -- let's gracefully handle it
if "rope_type" not in received_keys and "type" in received_keys:
received_keys -= {"type"}
received_keys.add("rope_type")
missing_keys = required_keys - received_keys
if missing_keys:
raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")
if optional_keys is not None:
unused_keys = received_keys - required_keys - optional_keys
else:
unused_keys = received_keys - required_keys
if unused_keys:
logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")
def _validate_default_rope_parameters(config: PretrainedConfig):
rope_scaling = config.rope_scaling
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
required_keys = {"rope_type"}
received_keys = set(rope_scaling.keys())
_check_received_keys(rope_type, received_keys, required_keys)
def _validate_linear_scaling_rope_parameters(config: PretrainedConfig):
rope_scaling = config.rope_scaling
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
required_keys = {"rope_type", "factor"}
received_keys = set(rope_scaling.keys())
_check_received_keys(rope_type, received_keys, required_keys)
factor = rope_scaling["factor"]
if factor is None or not isinstance(factor, float) or factor < 1.0:
logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig):
rope_scaling = config.rope_scaling
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
required_keys = {"rope_type", "factor"}
# TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
optional_keys = {"original_max_position_embeddings"}
received_keys = set(rope_scaling.keys())
_check_received_keys(rope_type, received_keys, required_keys, optional_keys)
factor = rope_scaling["factor"]
if factor is None or not isinstance(factor, float) or factor < 1.0:
logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
def _validate_yarn_parameters(config: PretrainedConfig):
rope_scaling = config.rope_scaling
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
required_keys = {"rope_type", "factor"}
optional_keys = {"attention_factor", "beta_fast", "beta_slow"}
received_keys = set(rope_scaling.keys())
_check_received_keys(rope_type, received_keys, required_keys, optional_keys)
factor = rope_scaling["factor"]
if factor is None or not isinstance(factor, float) or factor < 1.0:
logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
attention_factor = rope_scaling.get("attention_factor")
if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
logger.warning(
f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
)
beta_fast = rope_scaling.get("beta_fast")
if beta_fast is not None and not isinstance(beta_fast, float):
logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")
beta_slow = rope_scaling.get("beta_slow")
if beta_slow is not None and not isinstance(beta_slow, float):
logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")
if (beta_fast or 32) < (beta_slow or 1):
logger.warning(
f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
)
def _validate_longrope_parameters(config: PretrainedConfig):
rope_scaling = config.rope_scaling
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
required_keys = {"rope_type", "short_factor", "long_factor"}
# TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
received_keys = set(rope_scaling.keys())
_check_received_keys(rope_type, received_keys, required_keys, optional_keys)
partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor)
short_factor = rope_scaling.get("short_factor")
    if not (isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor)):
logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}")
if not len(short_factor) == dim // 2:
logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}")
long_factor = rope_scaling.get("long_factor")
    if not (isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor)):
logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}")
if not len(long_factor) == dim // 2:
logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}")
# Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
# `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
# unique to longrope (= undesirable)
if hasattr(config, "original_max_position_embeddings"):
logger.warning_once(
"This model has set a `original_max_position_embeddings` field, to be used together with "
"`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
"with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
"as it is compatible with most model architectures."
)
else:
factor = rope_scaling.get("factor")
if factor is None:
logger.warning("Missing required keys in `rope_scaling`: 'factor'")
elif not isinstance(factor, float) or factor < 1.0:
logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
attention_factor = rope_scaling.get("attention_factor")
    if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
logger.warning(
f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
)
def _validate_llama3_parameters(config: PretrainedConfig):
rope_scaling = config.rope_scaling
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
received_keys = set(rope_scaling.keys())
_check_received_keys(rope_type, received_keys, required_keys)
factor = rope_scaling["factor"]
if factor is None or not isinstance(factor, float) or factor < 1.0:
logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
low_freq_factor = rope_scaling["low_freq_factor"]
high_freq_factor = rope_scaling["high_freq_factor"]
if low_freq_factor is None or not isinstance(low_freq_factor, float):
logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
if high_freq_factor is None or not isinstance(high_freq_factor, float):
logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
if high_freq_factor < low_freq_factor:
logger.warning(
"`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
)
original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
logger.warning(
"`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
f"{original_max_position_embeddings}"
)
if original_max_position_embeddings >= config.max_position_embeddings:
logger.warning(
"`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
)
# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
"default": _validate_default_rope_parameters,
"linear": _validate_linear_scaling_rope_parameters,
"dynamic": _validate_dynamic_scaling_rope_parameters,
"yarn": _validate_yarn_parameters,
"longrope": _validate_longrope_parameters,
"llama3": _validate_llama3_parameters,
}
def rope_config_validation(config: PretrainedConfig):
"""
Validate the RoPE config arguments, given a `PretrainedConfig` object
"""
rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig`
if rope_scaling is None:
return
# BC: "rope_type" was originally "type"
rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
if validation_fn is not None:
validation_fn(config)
else:
logger.warning(
f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
)
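# Illustrative usage sketch (assumed values, not taken from this repo's configs): any
# config-like object exposing `rope_scaling` (and `max_position_embeddings`, which the
# llama3 validator reads) can be validated; malformed fields are logged, not raised.
def _example_rope_config_validation() -> None:
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        rope_scaling={
            "rope_type": "llama3",
            "factor": 8.0,
            "low_freq_factor": 1.0,
            "high_freq_factor": 4.0,
            "original_max_position_embeddings": 8192,
        },
        max_position_embeddings=131072,
    )
    rope_config_validation(cfg)  # emits no warnings for this well-formed llama3 config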
# Copyright (c) Meta Platforms, Inc. and affiliates.
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Implements HSTU (Hierarchical Sequential Transduction Unit) in
Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations
(https://arxiv.org/abs/2402.17152).
"""
import abc
import math
from typing import Callable, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from logging import getLogger
import fbgemm_gpu
from REC.utils.enum_type import InputType
from REC.model.basemodel import BaseModel, l2_norm, all_gather
def truncated_normal(x: torch.Tensor, mean: float, std: float) -> torch.Tensor:
with torch.no_grad():
size = x.shape
tmp = x.new_empty(size + (4,)).normal_()
valid = (tmp < 2) & (tmp > -2)
ind = valid.max(-1, keepdim=True)[1]
x.data.copy_(tmp.gather(-1, ind).squeeze(-1))
x.data.mul_(std).add_(mean)
return x
TIMESTAMPS_KEY = "timestamps"
class RelativeAttentionBiasModule(torch.nn.Module):
@abc.abstractmethod
def forward(
self,
all_timestamps: torch.Tensor,
) -> torch.Tensor:
"""
Args:
all_timestamps: [B, N] x int64
Returns:
torch.float tensor broadcastable to [B, N, N]
"""
pass
class RelativePositionalBias(RelativeAttentionBiasModule):
def __init__(self, max_seq_len: int) -> None:
super().__init__()
self._max_seq_len: int = max_seq_len
self._w = torch.nn.Parameter(
torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
)
def forward(
self,
all_timestamps: torch.Tensor,
) -> torch.Tensor:
del all_timestamps
n: int = self._max_seq_len
t = F.pad(self._w[: 2 * n - 1], [0, n]).repeat(n)
t = t[..., :-n].reshape(1, n, 3 * n - 2)
r = (2 * n - 1) // 2
return t[..., r:-r]
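# Illustrative shape sketch (toy size, not used by the model): the pad/repeat/reshape trick
# above turns the (2 * n - 1,) learnable vector into a (1, n, n) matrix of relative-position
# biases.
def _example_relative_positional_bias_shape() -> None:
    bias = RelativePositionalBias(max_seq_len=4)
    out = bias(torch.zeros(2, 4, dtype=torch.int64))  # timestamps are ignored here
    assert out.shape == (1, 4, 4)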
class RelativeBucketedTimeAndPositionBasedBias(RelativeAttentionBiasModule):
"""
Bucketizes timespans based on ts(next-item) - ts(current-item).
"""
def __init__(
self,
max_seq_len: int,
num_buckets: int,
bucketization_fn: Callable[[torch.Tensor], torch.Tensor],
) -> None:
super().__init__()
self._max_seq_len: int = max_seq_len
self._ts_w = torch.nn.Parameter(
torch.empty(num_buckets + 1).normal_(mean=0, std=0.02),
)
self._pos_w = torch.nn.Parameter(
torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
)
self._num_buckets: int = num_buckets
self._bucketization_fn: Callable[[torch.Tensor], torch.Tensor] = (
bucketization_fn
)
def forward(
self,
all_timestamps: torch.Tensor,
) -> torch.Tensor:
"""
Args:
all_timestamps: (B, N).
Returns:
(B, N, N).
"""
B = all_timestamps.size(0)
N = self._max_seq_len
t = F.pad(self._pos_w[: 2 * N - 1], [0, N]).repeat(N)
t = t[..., :-N].reshape(1, N, 3 * N - 2)
r = (2 * N - 1) // 2
# [B, N + 1] to simplify tensor manipulations.
ext_timestamps = torch.cat(
[all_timestamps, all_timestamps[:, N - 1: N]], dim=1
)
# causal masking. Otherwise [:, :-1] - [:, 1:] works
bucketed_timestamps = torch.clamp(
self._bucketization_fn(
ext_timestamps[:, 1:].unsqueeze(2) - ext_timestamps[:, :-1].unsqueeze(1)
),
min=0,
max=self._num_buckets,
).detach()
rel_pos_bias = t[:, :, r:-r]
rel_ts_bias = torch.index_select(
self._ts_w, dim=0, index=bucketed_timestamps.view(-1)
).view(B, N, N)
return rel_pos_bias + rel_ts_bias
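# Illustrative sketch (toy sizes and an assumed log-style bucketization): the module adds a
# learned relative-position bias to a learned bias over bucketized gaps ts(next) - ts(current).
def _example_bucketed_time_and_position_bias() -> None:
    bias = RelativeBucketedTimeAndPositionBasedBias(
        max_seq_len=4,
        num_buckets=8,
        bucketization_fn=lambda x: (torch.log(torch.abs(x).clamp(min=1)) / 0.301).long(),
    )
    timestamps = torch.tensor([[10, 20, 40, 80]], dtype=torch.int64)
    assert bias(timestamps).shape == (1, 4, 4)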
HSTUCacheState = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
def _hstu_attention_maybe_from_cache(
num_heads: int,
attention_dim: int,
linear_dim: int,
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
attention_mask: torch.Tensor # [bs, 1, n, n]
):
B, _, n, _ = attention_mask.size()
qk_attn = torch.einsum(
"bnhd,bmhd->bhnm",
q.view(B, n, num_heads, attention_dim),
k.view(B, n, num_heads, attention_dim),
)
qk_attn = F.silu(qk_attn) / n
qk_attn = qk_attn * attention_mask
attn_output = torch.einsum(
"bhnm,bmhd->bnhd",
qk_attn,
v.reshape(B, n, num_heads, linear_dim),
).reshape(B, n, num_heads * linear_dim)
return attn_output
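# Illustrative sketch (toy sizes): unlike softmax attention, the HSTU scoring above applies
# an elementwise SiLU to the q.k scores, divides by the sequence length, and multiplies by
# the broadcastable attention mask instead of adding a -inf mask.
def _example_hstu_pointwise_attention() -> None:
    B, n, h, d_qk, d_v = 2, 5, 2, 3, 4
    q = torch.randn(B, n, h * d_qk)
    k = torch.randn(B, n, h * d_qk)
    v = torch.randn(B, n, h * d_v)
    causal_mask = torch.tril(torch.ones(n, n)).view(1, 1, n, n).expand(B, 1, n, n)
    out = _hstu_attention_maybe_from_cache(
        num_heads=h, attention_dim=d_qk, linear_dim=d_v, q=q, k=k, v=v, attention_mask=causal_mask
    )
    assert out.shape == (B, n, h * d_v)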
class SequentialTransductionUnitJagged(torch.nn.Module):
def __init__(
self,
embedding_dim: int,
linear_hidden_dim: int,
attention_dim: int,
dropout_ratio: float,
attn_dropout_ratio: float,
num_heads: int,
linear_activation: str,
relative_attention_bias_module: Optional[RelativeAttentionBiasModule] = None,
normalization: str = "rel_bias",
linear_config: str = "uvqk",
concat_ua: bool = False,
epsilon: float = 1e-6,
max_length: Optional[int] = None,
) -> None:
super().__init__()
self._embedding_dim: int = embedding_dim
self._linear_dim: int = linear_hidden_dim
self._attention_dim: int = attention_dim
self._dropout_ratio: float = dropout_ratio
self._attn_dropout_ratio: float = attn_dropout_ratio
self._num_heads: int = num_heads
self._rel_attn_bias: Optional[RelativeAttentionBiasModule] = (
relative_attention_bias_module
)
self._normalization: str = normalization
self._linear_config: str = linear_config
if self._linear_config == "uvqk":
self._uvqk = torch.nn.Parameter(
torch.empty(
(
embedding_dim,
linear_hidden_dim * 2 * num_heads
+ attention_dim * num_heads * 2,
)
).normal_(mean=0, std=0.02),
)
else:
raise ValueError(f"Unknown linear_config {self._linear_config}")
self._linear_activation: str = linear_activation
self._concat_ua: bool = concat_ua
self._o = torch.nn.Linear(
in_features=linear_hidden_dim * num_heads * (3 if concat_ua else 1),
out_features=embedding_dim,
)
torch.nn.init.xavier_uniform_(self._o.weight)
self._eps: float = epsilon
def _norm_input(self, x: torch.Tensor) -> torch.Tensor:
return F.layer_norm(x, normalized_shape=[self._embedding_dim], eps=self._eps)
def _norm_attn_output(self, x: torch.Tensor) -> torch.Tensor:
return F.layer_norm(
x, normalized_shape=[self._linear_dim * self._num_heads], eps=self._eps
)
def forward(
self,
x: torch.Tensor,
attention_mask: torch.Tensor
) -> torch.Tensor:
"""
Args:
x: (\sum_i N_i, D) x float.
x_offsets: (B + 1) x int32.
all_timestamps: optional (B, N) x int64.
invalid_attn_mask: (B, N, N) x float, each element in {0, 1}.
delta_x_offsets: optional 2-tuple ((B,) x int32, (B,) x int32).
For the 1st element in the tuple, each element is in [0, x_offsets[-1]). For the
2nd element in the tuple, each element is in [0, N).
cache: Optional 4-tuple of (v, padded_q, padded_k, output) from prior runs,
where all except padded_q, padded_k are jagged.
Returns:
x' = f(x), (\sum_i N_i, D) x float.
"""
normed_x = self._norm_input(x)
if self._linear_config == "uvqk":
batched_mm_output = torch.matmul(normed_x, self._uvqk)
if self._linear_activation == "silu":
batched_mm_output = F.silu(batched_mm_output)
elif self._linear_activation == "none":
batched_mm_output = batched_mm_output
u, v, q, k = torch.split(
batched_mm_output,
[
self._linear_dim * self._num_heads,
self._linear_dim * self._num_heads,
self._attention_dim * self._num_heads,
self._attention_dim * self._num_heads,
],
dim=-1,
)
else:
raise ValueError(f"Unknown self._linear_config {self._linear_config}")
B: int = attention_mask.size(0)
if self._normalization == "rel_bias" or self._normalization == "hstu_rel_bias":
attn_output = _hstu_attention_maybe_from_cache(
num_heads=self._num_heads,
attention_dim=self._attention_dim,
linear_dim=self._linear_dim,
q=q,
k=k,
v=v,
attention_mask=attention_mask
)
if self._concat_ua:
a = self._norm_attn_output(attn_output)
o_input = torch.cat([u, a, u * a], dim=-1)
else:
o_input = u * self._norm_attn_output(attn_output)
new_outputs = (
self._o(
F.dropout(
o_input,
p=self._dropout_ratio,
training=self.training,
)
)
+ x
)
return new_outputs
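# Illustrative usage sketch (toy sizes): one HSTU block applied to a dense (B, N, D) batch
# with a causal float mask, mirroring how HSTUJagged drives these layers in this file.
def _example_sequential_transduction_unit() -> None:
    B, N, D = 2, 6, 8
    layer = SequentialTransductionUnitJagged(
        embedding_dim=D,
        linear_hidden_dim=4,
        attention_dim=4,
        dropout_ratio=0.0,
        attn_dropout_ratio=0.0,
        num_heads=2,
        linear_activation="silu",
    )
    x = torch.randn(B, N, D)
    causal_mask = torch.tril(torch.ones(N, N)).view(1, 1, N, N).expand(B, 1, N, N)
    assert layer(x=x, attention_mask=causal_mask).shape == (B, N, D)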
class HSTUJagged(torch.nn.Module):
def __init__(
self,
modules: List[SequentialTransductionUnitJagged],
autocast_dtype: torch.dtype,
) -> None:
super().__init__()
self._attention_layers: torch.nn.ModuleList = torch.nn.ModuleList(
modules=modules
)
self._autocast_dtype: torch.dtype = autocast_dtype
def forward(
self,
x: torch.Tensor,
attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            x: (B, N, D) x float.
            attention_mask: (B, 1, N, N) x bool/float; 0/False marks disallowed positions.
        Returns:
            x' = f(x), (B, N, D) x float.
        """
for i, layer in enumerate(self._attention_layers):
x = layer(
x=x,
attention_mask=attention_mask
)
return x
class HSTU(BaseModel):
"""
Implements HSTU (Hierarchical Sequential Transduction Unit) in
Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations,
https://arxiv.org/abs/2402.17152.
Note that this implementation is intended for reproducing experiments in
the traditional sequential recommender setting (Section 4.1.1), and does
not yet use optimized kernels discussed in the paper.
"""
input_type = InputType.SEQ
def __init__(self, config, dataload):
super().__init__()
self.logger = getLogger()
self.item_num = dataload.item_num
self._item_embedding_dim: int = config['item_embedding_size']
self._hstu_embedding_dim: int = config['hstu_embedding_size']
self._max_sequence_length: int = config['MAX_ITEM_LIST_LENGTH']
self._num_blocks: int = config['n_layers']
self._num_heads: int = config['n_heads']
self._dqk: int = config['hstu_embedding_size'] // config['n_heads']
self._dv: int = config['hstu_embedding_size'] // config['n_heads']
self._linear_activation: str = config['hidden_act'] if config['hidden_act'] else "silu"
self._linear_dropout_rate: float = config['hidden_dropout_prob']
self._attn_dropout_rate: float = config['attn_dropout_prob']
self._enable_relative_attention_bias: bool = config['enable_relative_attention_bias'] if config['enable_relative_attention_bias'] else False
self._linear_config = 'uvqk'
self._normalization = 'rel_bias'
self.position_embedding = nn.Embedding(self._max_sequence_length+1, self._hstu_embedding_dim)
self._hstu = HSTUJagged(
modules=[
SequentialTransductionUnitJagged(
embedding_dim=self._hstu_embedding_dim,
linear_hidden_dim=self._dv,
attention_dim=self._dqk,
normalization=self._normalization,
linear_config=self._linear_config,
linear_activation=self._linear_activation,
num_heads=self._num_heads,
# TODO: change to lambda x.
relative_attention_bias_module=(
RelativeBucketedTimeAndPositionBasedBias(
max_seq_len=self._max_sequence_length
+ self._max_sequence_length, # accounts for next item.
num_buckets=128,
bucketization_fn=lambda x: (
torch.log(torch.abs(x).clamp(min=1)) / 0.301
).long(),
)
if self._enable_relative_attention_bias
else None
),
dropout_ratio=self._linear_dropout_rate,
attn_dropout_ratio=self._attn_dropout_rate,
concat_ua=False,
)
for _ in range(self._num_blocks)
],
autocast_dtype=None,
)
self.item_embedding = nn.Embedding(self.item_num, self._item_embedding_dim, padding_idx=0)
self.item_id_proj_tower = nn.Identity() if config['item_embedding_size'] == config['hstu_embedding_size'] else nn.Linear(config['item_embedding_size'], config['hstu_embedding_size'], bias=False)
self.loss = config['loss']
if self.loss == 'nce':
if config['fix_temp']:
self.logger.info(f"Fixed logit_scale 20")
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.05), requires_grad=False)
else:
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
self.nce_thres = config['nce_thres'] if config['nce_thres'] else 0.99
self.num_negatives = config['num_negatives']
self.logger.info(f"nce thres setting to {self.nce_thres}")
else:
raise NotImplementedError(f"Only nce is supported")
# causal forward, w/ +1 for padding.
self.register_buffer(
"_attn_mask",
torch.triu(
torch.ones(
(
self._max_sequence_length,
self._max_sequence_length,
),
dtype=torch.bool,
),
diagonal=1,
),
)
self._verbose: bool = True
self.reset_params()
def reset_params(self):
for name, params in self.named_parameters():
if ("_hstu" in name) or ("_embedding_module" in name) or ('logit_scale' in name):
if self._verbose:
print(f"Skipping init for {name}")
continue
try:
truncated_normal(params.data, mean=0.0, std=0.02)
if self._verbose:
print(
f"Initialize {name} as trunc normal: {params.data.size()} params"
)
            except Exception:
if self._verbose:
print(f"Failed to initialize {name}: {params.data.size()} params")
def debug_str(self) -> str:
debug_str = (
f"HSTU-b{self._num_blocks}-h{self._num_heads}-dqk{self._dqk}-dv{self._dv}"
+ f"-l{self._linear_activation}d{self._linear_dropout_rate}"
+ f"-ad{self._attn_dropout_rate}"
)
if not self._enable_relative_attention_bias:
debug_str += "-norab"
return debug_str
def forward(self, interaction):
items, neg_items, masked_index = interaction # [batch, 2, seq_len] #[batch, max_seq_len-1]
if self.num_negatives:
neg_items = torch.randint(
low=1,
high=self.item_num,
size=(items.size(0), items.size(1) - 1, self.num_negatives),
dtype=items.dtype,
device=items.device,
)
pos_items_embs = self.item_id_proj_tower(self.item_embedding(items)) # [batch, 2, max_seq_len+1, dim]
        neg_items_embs = self.item_id_proj_tower(self.item_embedding(neg_items))  # [batch, seq_len-1, num_negatives, dim] when num_negatives is set
input_emb = pos_items_embs[:, :-1, :] # [batch, max_seq_len, dim]
position_ids = torch.arange(masked_index.size(1), dtype=torch.long, device=masked_index.device)
position_ids = position_ids.unsqueeze(0).expand_as(masked_index)
position_embedding = self.position_embedding(position_ids)
input_emb = input_emb + position_embedding
attention_mask = self.get_attention_mask(masked_index)
output_embs = self._hstu(
x=input_emb,
attention_mask=attention_mask
)
target_pos_embs = pos_items_embs[:, 1:, :] # [batch, max_seq_len, dim]
neg_embedding_all = neg_items_embs # [batch, max_seq_len, dim]
with torch.no_grad():
self.logit_scale.clamp_(0, np.log(100))
logit_scale = self.logit_scale.exp()
output_embs = output_embs / output_embs.norm(dim=-1, keepdim=True)
target_pos_embs = target_pos_embs / target_pos_embs.norm(dim=-1, keepdim=True)
neg_embedding_all = neg_embedding_all / neg_embedding_all.norm(dim=-1, keepdim=True)
pos_logits = F.cosine_similarity(output_embs, target_pos_embs, dim=-1).unsqueeze(-1)
if self.num_negatives:
neg_logits = F.cosine_similarity(output_embs.unsqueeze(2), neg_embedding_all, dim=-1)
fix_logits = F.cosine_similarity(target_pos_embs.unsqueeze(2), neg_embedding_all, dim=-1)
else:
D = neg_embedding_all.size(-1)
neg_embedding_all = all_gather(neg_embedding_all, sync_grads=True).reshape(-1, D) # [num, dim]
neg_embedding_all = neg_embedding_all.transpose(-1, -2)
neg_logits = torch.matmul(output_embs, neg_embedding_all)
fix_logits = torch.matmul(target_pos_embs, neg_embedding_all)
neg_logits[fix_logits > self.nce_thres] = torch.finfo(neg_logits.dtype).min
logits = torch.cat([pos_logits, neg_logits], dim=-1)
logits = logits[masked_index.bool()] * logit_scale
labels = torch.zeros(logits.size(0), device=logits.device, dtype=torch.int64)
model_out = {}
model_out['loss'] = F.cross_entropy(logits, labels)
model_out['nce_samples'] = (logits > torch.finfo(logits.dtype).min/100).sum(dim=1).float().mean()
for k in [1, 5, 10, 50, 100]:
if k > logits.size(1):
break
indices = logits.topk(k, dim=1).indices
model_out[f"nce_top{k}_acc"] = labels.view(-1, 1).eq(indices).any(dim=1).float().mean()
return model_out
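    # Illustrative sketch (assumed toy values) of the false-negative masking used in forward():
    # negatives whose similarity to the positive target exceeds nce_thres are pushed to the
    # dtype minimum, so they contribute ~zero probability to the cross-entropy over [pos, negs].
    @staticmethod
    def _example_nce_false_negative_masking() -> None:
        nce_thres = 0.99
        neg_logits = torch.tensor([[0.2, 0.9, 0.1]])
        fix_logits = torch.tensor([[0.3, 0.995, 0.2]])  # similarity(target, negative)
        neg_logits[fix_logits > nce_thres] = torch.finfo(neg_logits.dtype).min
        assert neg_logits[0, 1] == torch.finfo(neg_logits.dtype).min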
@torch.no_grad()
def predict(self, item_seq, time_seq, item_feature):
position_ids = torch.arange(item_seq.size(1), dtype=torch.long, device=item_seq.device)
position_ids = position_ids.unsqueeze(0).expand_as(item_seq)
position_embedding = self.position_embedding(position_ids)
item_emb = self.item_id_proj_tower(self.item_embedding(item_seq))
item_emb = item_emb + position_embedding
attention_mask = self.get_attention_mask(item_seq)
output_embs = self._hstu(
x=item_emb,
attention_mask=attention_mask
)
seq_output = output_embs[:, -1]
seq_output = seq_output / seq_output.norm(dim=-1, keepdim=True)
scores = torch.matmul(seq_output, item_feature.t())
return scores
@torch.no_grad()
def compute_item_all(self):
weight = self.item_id_proj_tower(self.item_embedding.weight)
return weight / weight.norm(dim=-1, keepdim=True)
def get_attention_mask(self, item_seq, bidirectional=False):
"""Generate left-to-right uni-directional or bidirectional attention mask for multi-head attention."""
attention_mask = (item_seq != 0)
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # torch.bool
if not bidirectional:
extended_attention_mask = torch.tril(extended_attention_mask.expand((-1, -1, item_seq.size(-1), -1)))
# extended_attention_mask = torch.where(extended_attention_mask, 0., -1e9)
return extended_attention_mask
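# Illustrative sketch of the mask built by get_attention_mask above: the boolean (B, 1, N, N)
# tensor is lower-triangular (causal) and False on padded positions (item id 0); HSTU consumes
# it multiplicatively rather than additively.
def _example_causal_padding_mask() -> None:
    item_seq = torch.tensor([[3, 7, 0, 0]])  # two real items followed by padding
    mask = torch.tril((item_seq != 0).unsqueeze(1).unsqueeze(2).expand(-1, -1, 4, -1))
    # row i lists the positions token i may attend to
    assert mask[0, 0, 1].tolist() == [True, True, False, False]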
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import transformers
from transformers import AutoConfig, AutoModelForCausalLM
from logging import getLogger
from REC.utils.enum_type import InputType
from REC.model.basemodel import BaseModel, all_gather
from REC.model.HLLM.modeling_llama import LlamaForCausalLM
from REC.model.HLLM.modeling_bert import BertModel
class LLMIDRec(BaseModel):
input_type = InputType.SEQ
def __init__(self, config, dataload):
super(LLMIDRec, self).__init__()
self.logger = getLogger()
self.user_pretrain_dir = config['user_pretrain_dir']
self.gradient_checkpointing = config['gradient_checkpointing']
self.use_ft_flash_attn = config['use_ft_flash_attn']
self.logger.info(f"create user llm")
self.user_llm = self.create_llm(self.user_pretrain_dir, config['user_llm_init'])
self.item_num = dataload.item_num
self.item_embedding = nn.Embedding(self.item_num, config['item_embed_dim'], padding_idx=0)
self.item_id_proj_tower = nn.Identity() if config['item_embed_dim'] == self.user_llm.config.hidden_size else nn.Linear(config['item_embed_dim'], self.user_llm.config.hidden_size, bias=False)
self.item_embedding.weight.data.normal_(mean=0.0, std=0.02)
self.loss = config['loss']
if self.loss == 'nce':
if config['fix_temp']:
self.logger.info(f"Fixed logit_scale 20")
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.05), requires_grad=False)
else:
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
self.nce_thres = config['nce_thres'] if config['nce_thres'] else 0.99
self.num_negatives = config['num_negatives']
self.logger.info(f"nce thres setting to {self.nce_thres}")
else:
raise NotImplementedError(f"Only nce is supported")
def create_llm(self, pretrain_dir, init=True):
self.logger.info(f"******* create LLM {pretrain_dir} *******")
hf_config = AutoConfig.from_pretrained(pretrain_dir, trust_remote_code=True)
self.logger.info(f"hf_config: {hf_config}")
hf_config.gradient_checkpointing = self.gradient_checkpointing
hf_config.use_cache = False
hf_config.output_hidden_states = True
hf_config.return_dict = True
self.logger.info("xxxxx starting loading checkpoint")
if isinstance(hf_config, transformers.LlamaConfig):
hf_config.use_ft_flash_attn = self.use_ft_flash_attn
self.logger.info(f'Using flash attention {hf_config.use_ft_flash_attn} for llama')
self.logger.info(f'Init {init} for llama')
if init:
return LlamaForCausalLM.from_pretrained(pretrain_dir, config=hf_config)
else:
return LlamaForCausalLM(config=hf_config).bfloat16()
elif isinstance(hf_config, transformers.BertConfig):
hf_config.use_ft_flash_attn = self.use_ft_flash_attn
self.logger.info(f'Using flash attention {hf_config.use_ft_flash_attn} for bert')
self.logger.info(f'Init {init} for bert')
if init:
return BertModel.from_pretrained(pretrain_dir, config=hf_config)
else:
return BertModel(config=hf_config).bfloat16()
else:
return AutoModelForCausalLM.from_pretrained(
                pretrain_dir, config=hf_config
)
def forward(self, interaction):
items, neg_items, masked_index = interaction # [batch, 2, seq_len] #[batch, max_seq_len-1]
if self.num_negatives:
neg_items = torch.randint(
low=1,
high=self.item_num,
size=(items.size(0), items.size(1) - 1, self.num_negatives),
dtype=items.dtype,
device=items.device,
)
pos_items_embs = self.item_id_proj_tower(self.item_embedding(items)) # [batch, 2, max_seq_len+1, dim]
neg_items_embs = self.item_id_proj_tower(self.item_embedding(neg_items)) # [batch, 2, max_seq_len+1, dim]
input_emb = pos_items_embs[:, :-1, :] # [batch, max_seq_len, dim]
target_pos_embs = pos_items_embs[:, 1:, :] # [batch, max_seq_len, dim]
neg_embedding_all = neg_items_embs # [batch, max_seq_len, dim]
output_embs = self.user_llm(inputs_embeds=input_emb, attention_mask=masked_index).hidden_states[-1]
with torch.no_grad():
self.logit_scale.clamp_(0, np.log(100))
logit_scale = self.logit_scale.exp()
output_embs = output_embs / output_embs.norm(dim=-1, keepdim=True)
target_pos_embs = target_pos_embs / target_pos_embs.norm(dim=-1, keepdim=True)
neg_embedding_all = neg_embedding_all / neg_embedding_all.norm(dim=-1, keepdim=True)
pos_logits = F.cosine_similarity(output_embs, target_pos_embs, dim=-1).unsqueeze(-1)
if self.num_negatives:
neg_logits = F.cosine_similarity(output_embs.unsqueeze(2), neg_embedding_all, dim=-1)
fix_logits = F.cosine_similarity(target_pos_embs.unsqueeze(2), neg_embedding_all, dim=-1)
else:
D = neg_embedding_all.size(-1)
neg_embedding_all = all_gather(neg_embedding_all, sync_grads=True).reshape(-1, D) # [num, dim]
neg_embedding_all = neg_embedding_all.transpose(-1, -2)
neg_logits = torch.matmul(output_embs, neg_embedding_all)
fix_logits = torch.matmul(target_pos_embs, neg_embedding_all)
neg_logits[fix_logits > self.nce_thres] = torch.finfo(neg_logits.dtype).min
logits = torch.cat([pos_logits, neg_logits], dim=-1)
logits = logits[masked_index.bool()] * logit_scale
labels = torch.zeros(logits.size(0), device=logits.device, dtype=torch.int64)
model_out = {}
model_out['loss'] = F.cross_entropy(logits, labels)
model_out['nce_samples'] = (logits > torch.finfo(logits.dtype).min/100).sum(dim=1).float().mean()
for k in [1, 5, 10, 50, 100]:
if k > logits.size(1):
break
indices = logits.topk(k, dim=1).indices
model_out[f"nce_top{k}_acc"] = labels.view(-1, 1).eq(indices).any(dim=1).float().mean()
return model_out
@torch.no_grad()
def predict(self, item_seq, time_seq, item_feature):
item_emb = self.item_id_proj_tower(self.item_embedding(item_seq))
attention_mask = (item_seq > 0).int()
output_embs = self.user_llm(inputs_embeds=item_emb, attention_mask=attention_mask).hidden_states[-1]
seq_output = output_embs[:, -1]
seq_output = seq_output / seq_output.norm(dim=-1, keepdim=True)
scores = torch.matmul(seq_output, item_feature.t())
return scores
@torch.no_grad()
def compute_item_all(self):
weight = self.item_id_proj_tower(self.item_embedding(torch.arange(self.item_num, device=self.item_embedding.weight.device)))
return weight / weight.norm(dim=-1, keepdim=True)
def get_attention_mask(self, item_seq, bidirectional=False):
"""Generate left-to-right uni-directional or bidirectional attention mask for multi-head attention."""
attention_mask = (item_seq != 0)
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # torch.bool
if not bidirectional:
extended_attention_mask = torch.tril(extended_attention_mask.expand((-1, -1, item_seq.size(-1), -1)))
extended_attention_mask = torch.where(extended_attention_mask, 0., -1e9)
return extended_attention_mask
# Copyright (c) 2024 westlake-repl
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# SPDX-License-Identifier: MIT
# This file has been modified by Junyi Chen.
#
# Original file was released under MIT, with the full license text
# available at https://choosealicense.com/licenses/mit/.
#
# This modified file is released under the same license.
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
from logging import getLogger
from REC.model.layers import TransformerEncoder
from REC.utils.enum_type import InputType
from REC.model.basemodel import BaseModel, all_gather
class SASRec(BaseModel):
input_type = InputType.SEQ
def __init__(self, config, dataload):
super(SASRec, self).__init__()
self.logger = getLogger()
# load parameters info
self.n_layers = config['n_layers']
self.n_heads = config['n_heads']
self.hidden_size = config['embedding_size'] # same as embedding_size
        self.inner_size = config['inner_size']  # multiplier for the feed-forward dimensionality
        self.inner_size *= self.hidden_size  # actual dimensionality of the feed-forward layer
self.hidden_dropout_prob = config['hidden_dropout_prob']
self.attn_dropout_prob = config['attn_dropout_prob']
self.hidden_act = config['hidden_act']
self.layer_norm_eps = config['layer_norm_eps']
self.initializer_range = config['initializer_range']
self.max_seq_length = config['MAX_ITEM_LIST_LENGTH']
self.item_num = dataload.item_num
# define layers and loss
self.item_embedding = nn.Embedding(self.item_num, self.hidden_size, padding_idx=0)
self.position_embedding = nn.Embedding(self.max_seq_length, self.hidden_size)
self.trm_encoder = TransformerEncoder(
n_layers=self.n_layers,
n_heads=self.n_heads,
hidden_size=self.hidden_size,
inner_size=self.inner_size,
hidden_dropout_prob=self.hidden_dropout_prob,
attn_dropout_prob=self.attn_dropout_prob,
hidden_act=self.hidden_act,
layer_norm_eps=self.layer_norm_eps
)
self.LayerNorm = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)
self.dropout = nn.Dropout(self.hidden_dropout_prob)
self.loss = config['loss']
if self.loss == 'nce':
if config['fix_temp']:
self.logger.info(f"Fixed logit_scale 20")
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.05), requires_grad=False)
else:
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
self.nce_thres = config['nce_thres'] if config['nce_thres'] else 0.99
self.num_negatives = config['num_negatives']
self.logger.info(f"nce thres setting to {self.nce_thres}")
else:
raise NotImplementedError(f"Only nce is supported")
# parameters initialization
self.apply(self._init_weights)
def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.initializer_range)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
def forward(self, interaction):
items, neg_items, masked_index = interaction # [batch, 2, seq_len] #[batch, max_seq_len-1]
if self.num_negatives:
neg_items = torch.randint(
low=1,
high=self.item_num,
size=(items.size(0), items.size(1) - 1, self.num_negatives),
dtype=items.dtype,
device=items.device,
)
pos_items_embs = self.item_embedding(items) # [batch, 2, max_seq_len+1, dim]
neg_items_embs = self.item_embedding(neg_items) # [batch, 2, max_seq_len+1, dim]
input_emb = pos_items_embs[:, :-1, :] # [batch, max_seq_len, dim]
target_pos_embs = pos_items_embs[:, 1:, :] # [batch, max_seq_len, dim]
neg_embedding_all = neg_items_embs # [batch, max_seq_len, dim]
position_ids = torch.arange(masked_index.size(1), dtype=torch.long, device=masked_index.device)
position_ids = position_ids.unsqueeze(0).expand_as(masked_index)
position_embedding = self.position_embedding(position_ids)
input_emb = input_emb + position_embedding
input_emb = self.LayerNorm(input_emb)
input_emb = self.dropout(input_emb)
extended_attention_mask = self.get_attention_mask(masked_index, bidirectional=False)
output_embs = self.trm_encoder(input_emb, extended_attention_mask, output_all_encoded_layers=False) # [batch, max_seq_len-1, dim]
output_embs = output_embs[-1]
with torch.no_grad():
self.logit_scale.clamp_(0, np.log(100))
logit_scale = self.logit_scale.exp()
output_embs = output_embs / output_embs.norm(dim=-1, keepdim=True)
target_pos_embs = target_pos_embs / target_pos_embs.norm(dim=-1, keepdim=True)
neg_embedding_all = neg_embedding_all / neg_embedding_all.norm(dim=-1, keepdim=True)
pos_logits = F.cosine_similarity(output_embs, target_pos_embs, dim=-1).unsqueeze(-1)
if self.num_negatives:
neg_logits = F.cosine_similarity(output_embs.unsqueeze(2), neg_embedding_all, dim=-1)
fix_logits = F.cosine_similarity(target_pos_embs.unsqueeze(2), neg_embedding_all, dim=-1)
else:
D = neg_embedding_all.size(-1)
neg_embedding_all = all_gather(neg_embedding_all, sync_grads=True).reshape(-1, D) # [num, dim]
neg_embedding_all = neg_embedding_all.transpose(-1, -2)
neg_logits = torch.matmul(output_embs, neg_embedding_all)
fix_logits = torch.matmul(target_pos_embs, neg_embedding_all)
neg_logits[fix_logits > self.nce_thres] = torch.finfo(neg_logits.dtype).min
logits = torch.cat([pos_logits, neg_logits], dim=-1)
logits = logits[masked_index.bool()] * logit_scale
labels = torch.zeros(logits.size(0), device=logits.device, dtype=torch.int64)
model_out = {}
model_out['loss'] = F.cross_entropy(logits, labels)
model_out['nce_samples'] = (logits > torch.finfo(logits.dtype).min/100).sum(dim=1).float().mean()
for k in [1, 5, 10, 50, 100]:
if k > logits.size(1):
break
indices = logits.topk(k, dim=1).indices
model_out[f"nce_top{k}_acc"] = labels.view(-1, 1).eq(indices).any(dim=1).float().mean()
return model_out
@torch.no_grad()
def predict(self, item_seq, time_seq, item_feature):
position_ids = torch.arange(item_seq.size(1), dtype=torch.long, device=item_seq.device)
position_ids = position_ids.unsqueeze(0).expand_as(item_seq)
position_embedding = self.position_embedding(position_ids)
item_emb = self.item_embedding(item_seq)
input_emb = item_emb + position_embedding
input_emb = self.LayerNorm(input_emb)
input_emb = self.dropout(input_emb)
extended_attention_mask = self.get_attention_mask(item_seq, bidirectional=False)
output = self.trm_encoder(input_emb, extended_attention_mask, output_all_encoded_layers=False)
output_embs = output[-1]
seq_output = output_embs[:, -1]
seq_output = seq_output / seq_output.norm(dim=-1, keepdim=True)
scores = torch.matmul(seq_output, item_feature.t())
return scores
@torch.no_grad()
def compute_item_all(self):
weight = self.item_embedding.weight
return weight / weight.norm(dim=-1, keepdim=True)
def get_attention_mask(self, item_seq, bidirectional=False):
"""Generate left-to-right uni-directional or bidirectional attention mask for multi-head attention."""
attention_mask = (item_seq != 0)
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # torch.bool
if not bidirectional:
extended_attention_mask = torch.tril(extended_attention_mask.expand((-1, -1, item_seq.size(-1), -1)))
extended_attention_mask = torch.where(extended_attention_mask, 0., -1e9)
return extended_attention_mask
# Copyright (c) 2024 westlake-repl
# SPDX-License-Identifier: MIT
import numpy as np
import torch
import torch.nn as nn
from REC.utils import set_color
def all_gather(data,
group=None,
sync_grads=False):
group = group if group is not None else torch.distributed.group.WORLD
if torch.distributed.get_world_size() > 1:
from torch.distributed import nn
if sync_grads:
return torch.stack(nn.functional.all_gather(data, group=group), dim=0)
with torch.no_grad():
return torch.stack(nn.functional.all_gather(data, group=group), dim=0)
else:
return data.unsqueeze(0)
def l2_norm(x, eps=1e-6):
x = x / torch.clamp(
torch.linalg.norm(x, ord=2, dim=-1, keepdim=True),
min=eps,
)
return x
class BaseModel(nn.Module):
def __init__(self):
super(BaseModel, self).__init__()
def load_weights(self, path):
checkpoint = torch.load(path, map_location='cpu')
pretrained_dicts = checkpoint['state_dict']
self.load_state_dict({k.replace('item_embedding.rec_fc', 'visual_encoder.item_encoder.fc'): v for k, v in pretrained_dicts.items()}, strict=False)
def __str__(self):
"""
Model prints with number of trainable parameters
"""
model_parameters = filter(lambda p: p.requires_grad, self.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
return super().__str__() + set_color('\nTrainable parameters', 'blue') + f': {params}'
# Copyright (c) 2024 westlake-repl
# SPDX-License-Identifier: MIT
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as fn
from torch.nn.init import normal_
import copy
# [64, 128, 512] -> (64,128), (128,512)
# [64]
class MLPLayers(nn.Module):
r""" MLPLayers
Args:
- layers(list): a list contains the size of each layer in mlp layers
- dropout(float): probability of an element to be zeroed. Default: 0
- activation(str): activation function after each layer in mlp layers. Default: 'relu'.
                      candidates: 'sigmoid', 'tanh', 'relu', 'leakyrelu', 'none'
Shape:
- Input: (:math:`N`, \*, :math:`H_{in}`) where \* means any number of additional dimensions
:math:`H_{in}` must equal to the first value in `layers`
- Output: (:math:`N`, \*, :math:`H_{out}`) where :math:`H_{out}` equals to the last value in `layers`
Examples::
>>> m = MLPLayers([64, 32, 16], 0.2, 'relu')
>>> input = torch.randn(128, 64)
>>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 16])
"""
def __init__(self, layers, dropout=0., activation='relu', bn=False, init_method=None):
super(MLPLayers, self).__init__()
self.layers = layers
self.dropout = dropout
self.activation = activation
self.use_bn = bn
self.init_method = init_method
mlp_modules = []
for idx, (input_size, output_size) in enumerate(zip(self.layers[:-1], self.layers[1:])):
mlp_modules.append(nn.Dropout(p=self.dropout))
mlp_modules.append(nn.Linear(input_size, output_size))
if self.use_bn:
mlp_modules.append(nn.BatchNorm1d(num_features=output_size))
activation_func = activation_layer(self.activation, output_size)
if activation_func is not None:
mlp_modules.append(activation_func)
self.mlp_layers = nn.Sequential(*mlp_modules)
if self.init_method is not None:
self.apply(self.init_weights)
def init_weights(self, module):
# We just initialize the module with normal distribution as the paper said
if isinstance(module, nn.Linear):
if self.init_method == 'norm':
normal_(module.weight.data, 0, 0.01)
if module.bias is not None:
module.bias.data.fill_(0.0)
def forward(self, input_feature):
return self.mlp_layers(input_feature)
def activation_layer(activation_name='relu', emb_dim=None):
"""Construct activation layers
Args:
activation_name: str, name of activation function
emb_dim: int, used for Dice activation
Return:
activation: activation layer
"""
if activation_name is None:
activation = None
elif isinstance(activation_name, str):
if activation_name.lower() == 'sigmoid':
activation = nn.Sigmoid()
elif activation_name.lower() == 'tanh':
activation = nn.Tanh()
elif activation_name.lower() == 'relu':
activation = nn.ReLU()
elif activation_name.lower() == 'leakyrelu':
activation = nn.LeakyReLU()
elif activation_name.lower() == 'dice':
activation = Dice(emb_dim)
elif activation_name.lower() == 'none':
activation = None
elif issubclass(activation_name, nn.Module):
activation = activation_name()
else:
raise NotImplementedError("activation function {} is not implemented".format(activation_name))
return activation
class FMEmbedding(nn.Module):
r""" Embedding for token fields.
Args:
field_dims: list, the number of tokens in each token fields
offsets: list, the dimension offset of each token field
embed_dim: int, the dimension of output embedding vectors
Input:
        input_x: tensor, A 2D tensor with shape: ``(batch_size, field_size)``.
Return:
output: tensor, A 3D tensor with shape: ``(batch_size,field_size,embed_dim)``.
"""
def __init__(self, field_dims, offsets, embed_dim):
super(FMEmbedding, self).__init__()
self.embedding = nn.Embedding(sum(field_dims), embed_dim)
self.offsets = offsets
def forward(self, input_x):
input_x = input_x + input_x.new_tensor(self.offsets).unsqueeze(0)
output = self.embedding(input_x)
return output
class BaseFactorizationMachine(nn.Module):
r"""Calculate FM result over the embeddings
Args:
reduce_sum: bool, whether to sum the result, default is True.
Input:
input_x: tensor, A 3D tensor with shape:``(batch_size,field_size,embed_dim)``.
    Output:
        output: tensor, A 2D tensor with shape: ``(batch_size, 1)`` or ``(batch_size, embed_dim)``.
"""
def __init__(self, reduce_sum=True):
super(BaseFactorizationMachine, self).__init__()
self.reduce_sum = reduce_sum
def forward(self, input_x):
square_of_sum = torch.sum(input_x, dim=1) ** 2
sum_of_square = torch.sum(input_x ** 2, dim=1)
output = square_of_sum - sum_of_square
if self.reduce_sum:
output = torch.sum(output, dim=1, keepdim=True)
output = 0.5 * output
return output
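# Illustrative sketch: the 0.5 * ((sum)^2 - sum of squares) computation above is the standard
# FM identity, equal to the explicit sum of pairwise element-wise products over fields.
def _example_fm_pairwise_identity() -> None:
    x = torch.randn(2, 3, 4)  # (batch_size, field_size, embed_dim)
    fm = BaseFactorizationMachine(reduce_sum=False)
    explicit = sum(x[:, i] * x[:, j] for i in range(3) for j in range(i + 1, 3))
    assert torch.allclose(fm(x), explicit, atol=1e-5)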
class BiGNNLayer(nn.Module):
r"""Propagate a layer of Bi-interaction GNN
.. math::
output = (L+I)EW_1 + LE \otimes EW_2
"""
def __init__(self, in_dim, out_dim):
super(BiGNNLayer, self).__init__()
self.in_dim = in_dim
self.out_dim = out_dim
self.linear = torch.nn.Linear(in_features=in_dim, out_features=out_dim)
self.interActTransform = torch.nn.Linear(in_features=in_dim, out_features=out_dim)
def forward(self, lap_matrix, eye_matrix, features):
        # for GCF, adjMat is an (N+M) by (N+M) matrix
        # lap_matrix L = D^-1(A)D^-1  # the Laplacian matrix
x = torch.sparse.mm(lap_matrix, features)
inter_part1 = self.linear(features + x)
inter_feature = torch.mul(x, features)
inter_part2 = self.interActTransform(inter_feature)
return inter_part1 + inter_part2
class AttLayer(nn.Module):
"""Calculate the attention signal(weight) according the input tensor.
Args:
infeatures (torch.FloatTensor): A 3D input tensor with shape of[batch_size, M, embed_dim].
Returns:
torch.FloatTensor: Attention weight of input. shape of [batch_size, M].
"""
def __init__(self, in_dim, att_dim):
super(AttLayer, self).__init__()
self.in_dim = in_dim
self.att_dim = att_dim
self.w = torch.nn.Linear(in_features=in_dim, out_features=att_dim, bias=False)
self.h = nn.Parameter(torch.randn(att_dim), requires_grad=True)
def forward(self, infeatures):
att_signal = self.w(infeatures) # [batch_size, M, att_dim]
att_signal = fn.relu(att_signal) # [batch_size, M, att_dim]
att_signal = torch.mul(att_signal, self.h) # [batch_size, M, att_dim]
att_signal = torch.sum(att_signal, dim=2) # [batch_size, M]
att_signal = fn.softmax(att_signal, dim=1) # [batch_size, M]
return att_signal
class Dice(nn.Module):
r"""Dice activation function
.. math::
f(s)=p(s) \cdot s+(1-p(s)) \cdot \alpha s
.. math::
p(s)=\frac{1} {1 + e^{-\frac{s-E[s]} {\sqrt {Var[s] + \epsilon}}}}
"""
def __init__(self, emb_size):
super(Dice, self).__init__()
self.sigmoid = nn.Sigmoid()
self.alpha = torch.zeros((emb_size,))
def forward(self, score):
self.alpha = self.alpha.to(score.device)
score_p = self.sigmoid(score)
return self.alpha * (1 - score_p) * score + score_p * score
class SequenceAttLayer(nn.Module):
"""Attention Layer. Get the representation of each user in the batch.
Args:
queries (torch.Tensor): candidate ads, [B, H], H means embedding_size * feat_num
keys (torch.Tensor): user_hist, [B, T, H]
keys_length (torch.Tensor): mask, [B]
Returns:
torch.Tensor: result
"""
def __init__(
self, att_hidden_size=(80, 40), activation='sigmoid', softmax_stag=False, return_seq_weight=True
):
super(SequenceAttLayer, self).__init__()
self.att_hidden_size = att_hidden_size
self.activation = activation
self.softmax_stag = softmax_stag
self.return_seq_weight = return_seq_weight
self.att_mlp_layers = MLPLayers(self.att_hidden_size, activation='Sigmoid', bn=False)
self.dense = nn.Linear(self.att_hidden_size[-1], 1)
def forward(self, queries, keys, mask):
embedding_size = queries.shape[-1] # H
hist_len = keys.shape[1] # T
queries = queries.repeat(1, hist_len)
queries = queries.view(-1, hist_len, embedding_size)
# MLP Layer
input_tensor = torch.cat([queries, keys, queries - keys, queries * keys], dim=-1)
output = self.att_mlp_layers(input_tensor)
output = torch.transpose(self.dense(output), -1, -2)
# get mask
output = output.squeeze(1)
# mask
if self.softmax_stag:
mask_value = -np.inf
else:
mask_value = 0.0
output = output.masked_fill(mask=mask, value=torch.tensor(mask_value))
output = output.unsqueeze(1)
output = output / (embedding_size ** 0.5)
# get the weight of each user's history list about the target item
if self.softmax_stag:
output = fn.softmax(output, dim=2) # [B, 1, T]
if not self.return_seq_weight:
output = torch.matmul(output, keys) # [B, 1, H]
return output
class VanillaAttention(nn.Module):
"""
    Vanilla attention layer implemented with linear layers.
Args:
input_tensor (torch.Tensor): the input of the attention layer
Returns:
hidden_states (torch.Tensor): the outputs of the attention layer
weights (torch.Tensor): the attention weights
"""
def __init__(self, hidden_dim, attn_dim):
super().__init__()
self.projection = nn.Sequential(nn.Linear(hidden_dim, attn_dim), nn.ReLU(True), nn.Linear(attn_dim, 1))
def forward(self, input_tensor):
# (B, Len, num, H) -> (B, Len, num, 1)
energy = self.projection(input_tensor)
weights = torch.softmax(energy.squeeze(-1), dim=-1)
# (B, Len, num, H) * (B, Len, num, 1) -> (B, len, H)
hidden_states = (input_tensor * weights.unsqueeze(-1)).sum(dim=-2)
return hidden_states, weights
class MultiHeadAttention(nn.Module):
"""
    Multi-head self-attention layer; an attention-score dropout layer is introduced.
Args:
input_tensor (torch.Tensor): the input of the multi-head self-attention layer
attention_mask (torch.Tensor): the attention mask for input tensor
Returns:
hidden_states (torch.Tensor): the output of the multi-head self-attention layer
"""
def __init__(self, n_heads, hidden_size, hidden_dropout_prob, attn_dropout_prob, layer_norm_eps):
super(MultiHeadAttention, self).__init__()
if hidden_size % n_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, n_heads)
)
self.num_attention_heads = n_heads
self.attention_head_size = int(hidden_size / n_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.sqrt_attention_head_size = math.sqrt(self.attention_head_size)
self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)
self.softmax = nn.Softmax(dim=-1)
self.attn_dropout = nn.Dropout(attn_dropout_prob)
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
self.out_dropout = nn.Dropout(hidden_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x
def forward(self, input_tensor, attention_mask):
mixed_query_layer = self.query(input_tensor)
mixed_key_layer = self.key(input_tensor)
mixed_value_layer = self.value(input_tensor)
query_layer = self.transpose_for_scores(mixed_query_layer).permute(0, 2, 1, 3)
key_layer = self.transpose_for_scores(mixed_key_layer).permute(0, 2, 3, 1)
value_layer = self.transpose_for_scores(mixed_value_layer).permute(0, 2, 1, 3)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer)
attention_scores = attention_scores / self.sqrt_attention_head_size
        # Apply the attention mask (precomputed for all layers in the model's forward() function).
        # attention_scores: [batch_size, heads, seq_len, seq_len]; attention_mask is broadcastable to that shape.
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = self.softmax(attention_scores).to(attention_scores.dtype)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.attn_dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
hidden_states = self.dense(context_layer)
hidden_states = self.out_dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class FeedForward(nn.Module):
"""
Point-wise feed-forward layer is implemented by two dense layers.
Args:
input_tensor (torch.Tensor): the input of the point-wise feed-forward layer
Returns:
hidden_states (torch.Tensor): the output of the point-wise feed-forward layer
"""
def __init__(self, hidden_size, inner_size, hidden_dropout_prob, hidden_act, layer_norm_eps):
super(FeedForward, self).__init__()
self.dense_1 = nn.Linear(hidden_size, inner_size)
self.intermediate_act_fn = self.get_hidden_act(hidden_act)
self.dense_2 = nn.Linear(inner_size, hidden_size)
self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
self.dropout = nn.Dropout(hidden_dropout_prob)
def get_hidden_act(self, act):
ACT2FN = {
"gelu": self.gelu,
"relu": fn.relu,
"swish": self.swish,
"tanh": torch.tanh,
"sigmoid": torch.sigmoid,
}
return ACT2FN[act]
def gelu(self, x):
"""Implementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results)::
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def swish(self, x):
return x * torch.sigmoid(x)
def forward(self, input_tensor):
hidden_states = self.dense_1(input_tensor)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dense_2(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class TransformerLayer(nn.Module):
"""
One transformer layer consists of a multi-head self-attention layer and a point-wise feed-forward layer.
Args:
hidden_states (torch.Tensor): the input of the multi-head self-attention sublayer
attention_mask (torch.Tensor): the attention mask for the multi-head self-attention sublayer
Returns:
feedforward_output (torch.Tensor): The output of the point-wise feed-forward sublayer,
is the output of the transformer layer.
"""
def __init__(
self, n_heads, hidden_size, intermediate_size, hidden_dropout_prob, attn_dropout_prob, hidden_act,
layer_norm_eps
):
super(TransformerLayer, self).__init__()
self.multi_head_attention = MultiHeadAttention(
n_heads, hidden_size, hidden_dropout_prob, attn_dropout_prob, layer_norm_eps
)
self.feed_forward = FeedForward(hidden_size, intermediate_size, hidden_dropout_prob, hidden_act, layer_norm_eps)
def forward(self, hidden_states, attention_mask):
attention_output = self.multi_head_attention(hidden_states, attention_mask)
feedforward_output = self.feed_forward(attention_output)
return feedforward_output
class TransformerEncoder(nn.Module):
r""" One TransformerEncoder consists of several TransformerLayers.
Args:
n_layers(num): num of transformer layers in transformer encoder. Default: 2
n_heads(num): num of attention heads for multi-head attention layer. Default: 2
hidden_size(num): the input and output hidden size. Default: 64
inner_size(num): the dimensionality in feed-forward layer. Default: 256
hidden_dropout_prob(float): probability of an element to be zeroed. Default: 0.5
attn_dropout_prob(float): probability of an attention score to be zeroed. Default: 0.5
hidden_act(str): activation function in feed-forward layer. Default: 'gelu'
candidates: 'gelu', 'relu', 'swish', 'tanh', 'sigmoid'
layer_norm_eps(float): a value added to the denominator for numerical stability. Default: 1e-12
"""
def __init__(
self,
n_layers=2,
n_heads=2,
hidden_size=64,
inner_size=256,
hidden_dropout_prob=0.5,
attn_dropout_prob=0.5,
hidden_act='gelu',
layer_norm_eps=1e-12
):
super(TransformerEncoder, self).__init__()
layer = TransformerLayer(
n_heads, hidden_size, inner_size, hidden_dropout_prob, attn_dropout_prob, hidden_act, layer_norm_eps
)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
"""
Args:
hidden_states (torch.Tensor): the input of the TransformerEncoder
attention_mask (torch.Tensor): the attention mask for the input hidden_states
output_all_encoded_layers (Bool): whether output all transformer layers' output
Returns:
all_encoder_layers (list): if output_all_encoded_layers is True, return a list consisting of every transformer
layer's output; otherwise return a list containing only the output of the last transformer layer.
"""
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers
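# Usage sketch for TransformerEncoder (illustrative only; the tensor names and the
# -10000.0 masking constant below are assumptions of this example, not part of this file).
# The encoder expects an additive attention mask that broadcasts against the
# [batch_size, heads, seq_len, seq_len] score matrix: 0 for visible positions and a
# large negative value for masked ones.
#   >>> encoder = TransformerEncoder(n_layers=2, n_heads=2, hidden_size=64, inner_size=256)
#   >>> hidden_states = torch.randn(8, 50, 64)                      # [B, L, d]
#   >>> pad_mask = torch.ones(8, 50)                                # 1 = real token, 0 = padding
#   >>> attn_mask = (1.0 - pad_mask[:, None, None, :]) * -10000.0   # additive mask, [B, 1, 1, L]
#   >>> outputs = encoder(hidden_states, attn_mask, output_all_encoded_layers=False)
#   >>> outputs[-1].shape
#   torch.Size([8, 50, 64])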
class ItemToInterestAggregation(nn.Module):
def __init__(self, seq_len, hidden_size, k_interests=5):
super().__init__()
self.k_interests = k_interests # k latent interests
self.theta = nn.Parameter(torch.randn([hidden_size, k_interests]))
def forward(self, input_tensor): # [B, L, d] -> [B, k, d]
D_matrix = torch.matmul(input_tensor, self.theta) # [B, L, k]
D_matrix = nn.Softmax(dim=-2)(D_matrix)
result = torch.einsum('nij, nik -> nkj', input_tensor, D_matrix) # [B, k, d]
return result
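# Shape walkthrough for ItemToInterestAggregation above: with input_tensor of shape
# [B, L, d] and theta of shape [d, k_interests], D_matrix = input_tensor @ theta is
# [B, L, k]; the softmax over dim=-2 normalizes across the L sequence positions for
# each interest, and einsum('nij,nik->nkj') pools the [B, L, d] sequence with those
# weights into k interest vectors per sample, i.e. a [B, k, d] result.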
class LightMultiHeadAttention(nn.Module):
def __init__(self, n_heads, k_interests, hidden_size, seq_len, hidden_dropout_prob, attn_dropout_prob, layer_norm_eps):
super(LightMultiHeadAttention, self).__init__()
if hidden_size % n_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, n_heads))
self.num_attention_heads = n_heads
self.attention_head_size = int(hidden_size / n_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
# initialization for low-rank decomposed self-attention
self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)
self.attpooling_key = ItemToInterestAggregation(seq_len, hidden_size, k_interests)
self.attpooling_value = ItemToInterestAggregation(seq_len, hidden_size, k_interests)
# initialization for decoupled position encoding
self.attn_scale_factor = 2
self.pos_q_linear = nn.Linear(hidden_size, self.all_head_size)
self.pos_k_linear = nn.Linear(hidden_size, self.all_head_size)
self.pos_scaling = float(self.attention_head_size * self.attn_scale_factor) ** -0.5
self.pos_ln = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
self.attn_dropout = nn.Dropout(attn_dropout_prob)
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
self.out_dropout = nn.Dropout(hidden_dropout_prob)
def transpose_for_scores(self, x):  # reshape to the multi-head layout
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(self, input_tensor, pos_emb):
# linear map
mixed_query_layer = self.query(input_tensor)
mixed_key_layer = self.key(input_tensor)
mixed_value_layer = self.value(input_tensor)
# low-rank decomposed self-attention: relation of items
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(self.attpooling_key(mixed_key_layer))
value_layer = self.transpose_for_scores(self.attpooling_value(mixed_value_layer))
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-2)(attention_scores)
attention_probs = self.attn_dropout(attention_probs)
context_layer_item = torch.matmul(attention_probs, value_layer)
# decoupled position encoding: relation of positions
value_layer_pos = self.transpose_for_scores(mixed_value_layer)
pos_emb = self.pos_ln(pos_emb).unsqueeze(0)
pos_query_layer = self.transpose_for_scores(self.pos_q_linear(pos_emb)) * self.pos_scaling
pos_key_layer = self.transpose_for_scores(self.pos_k_linear(pos_emb))
abs_pos_bias = torch.matmul(pos_query_layer, pos_key_layer.transpose(-1, -2))
abs_pos_bias = abs_pos_bias / math.sqrt(self.attention_head_size)
abs_pos_bias = nn.Softmax(dim=-2)(abs_pos_bias)
context_layer_pos = torch.matmul(abs_pos_bias, value_layer_pos)
context_layer = context_layer_item + context_layer_pos
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
hidden_states = self.dense(context_layer)
hidden_states = self.out_dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
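# Note on LightMultiHeadAttention above: it sums two branches. The item branch attends
# from queries of shape [B, heads, L, d_head] to keys/values pooled down to k_interests
# slots ([B, heads, k, d_head]), so its score matrix is [B, heads, L, k] rather than the
# usual [B, heads, L, L]. The position branch builds a [heads, L, L] bias from the
# layer-normed position embeddings alone and applies it to the un-pooled values.
# Both context tensors are added before the output projection, dropout and residual LayerNorm.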
class LightTransformerLayer(nn.Module):
"""
One transformer layer consists of a multi-head self-attention layer and a point-wise feed-forward layer.
Args:
hidden_states (torch.Tensor): the input of the multi-head self-attention sublayer
attention_mask (torch.Tensor): the attention mask for the multi-head self-attention sublayer
Returns:
feedforward_output (torch.Tensor): the output of the point-wise feed-forward sublayer, which is also the output of the transformer layer
"""
def __init__(self, n_heads, k_interests, hidden_size, seq_len, intermediate_size,
hidden_dropout_prob, attn_dropout_prob, hidden_act, layer_norm_eps):
super(LightTransformerLayer, self).__init__()
self.multi_head_attention = LightMultiHeadAttention(n_heads, k_interests, hidden_size,
seq_len, hidden_dropout_prob, attn_dropout_prob, layer_norm_eps)
self.feed_forward = FeedForward(hidden_size, intermediate_size,
hidden_dropout_prob, hidden_act, layer_norm_eps)
def forward(self, hidden_states, pos_emb):
attention_output = self.multi_head_attention(hidden_states, pos_emb)
feedforward_output = self.feed_forward(attention_output)
return feedforward_output
class LightTransformerEncoder(nn.Module):
r""" One LightTransformerEncoder consists of several LightTransformerLayers.
Args:
n_layers(num): num of transformer layers in transformer encoder. Default: 2
n_heads(num): num of attention heads for multi-head attention layer. Default: 2
hidden_size(num): the input and output hidden size. Default: 64
inner_size(num): the dimensionality in feed-forward layer. Default: 256
hidden_dropout_prob(float): probability of an element to be zeroed. Default: 0.5
attn_dropout_prob(float): probability of an attention score to be zeroed. Default: 0.5
hidden_act(str): activation function in feed-forward layer. Default: 'gelu'.
candidates: 'gelu', 'relu', 'swish', 'tanh', 'sigmoid'
layer_norm_eps(float): a value added to the denominator for numerical stability. Default: 1e-12
"""
def __init__(self,
n_layers=2,
n_heads=2,
k_interests=5,
hidden_size=64,
seq_len=50,
inner_size=256,
hidden_dropout_prob=0.5,
attn_dropout_prob=0.5,
hidden_act='gelu',
layer_norm_eps=1e-12):
super(LightTransformerEncoder, self).__init__()
layer = LightTransformerLayer(n_heads, k_interests, hidden_size, seq_len, inner_size,
hidden_dropout_prob, attn_dropout_prob, hidden_act, layer_norm_eps)
self.layer = nn.ModuleList([copy.deepcopy(layer)
for _ in range(n_layers)])
def forward(self, hidden_states, pos_emb, output_all_encoded_layers=True):
"""
Args:
hidden_states (torch.Tensor): the input of the LightTransformerEncoder
pos_emb (torch.Tensor): the decoupled position embedding fed to every layer
output_all_encoded_layers (Bool): whether output all transformer layers' output
Returns:
all_encoder_layers (list): if output_all_encoded_layers is True, return a list consisting of every transformer layer's output;
otherwise return a list containing only the output of the last transformer layer.
"""
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, pos_emb)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers
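# Usage sketch for LightTransformerEncoder (illustrative only; the tensor names are
# assumptions of this example). Note that pos_emb is a position-embedding table of
# shape [seq_len, hidden_size] shared by every layer, not an attention mask.
#   >>> encoder = LightTransformerEncoder(n_layers=2, n_heads=2, k_interests=5,
#   ...                                   hidden_size=64, seq_len=50, inner_size=256)
#   >>> hidden_states = torch.randn(8, 50, 64)   # [B, L, d]
#   >>> pos_emb = torch.randn(50, 64)            # [L, d]
#   >>> outputs = encoder(hidden_states, pos_emb, output_all_encoded_layers=False)
#   >>> outputs[-1].shape
#   torch.Size([8, 50, 64])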
class CNNLayers(nn.Module):
r""" CNNLayers
Args:
- channels(list): a list contains the channels of each layer in cnn layers
- kernel(list): a list contains the kernels of each layer in cnn layers
- strides(list): a list contains the strides of each layer in cnn layers
- activation(str): activation function after each layer in cnn layers. Default: 'relu'
candidates: 'sigmoid', 'tanh', 'relu', 'leakyrelu', 'none'
Shape:
- Input: :math:`(N, C_{in}, H_{in}, W_{in})`
- Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
.. math::
H_{out} = \left\lfloor\frac{H_{in} + 2 \times \text{padding}[0] - \text{dilation}[0]
\times (\text{kernel\_size}[0] - 1) - 1}{\text{stride}[0]} + 1\right\rfloor
.. math::
W_{out} = \left\lfloor\frac{W_{in} + 2 \times \text{padding}[1] - \text{dilation}[1]
\times (\text{kernel\_size}[1] - 1) - 1}{\text{stride}[1]} + 1\right\rfloor
Examples::
>>> m = CNNLayers([1, 32, 32], [2,2], [2,2], 'relu')
>>> input = torch.randn(128, 1, 64, 64)
>>> output = m(input)
>>> print(output.size())
torch.Size([128, 32, 16, 16])
"""
def __init__(self, channels, kernels, strides, activation='relu', init_method=None):
super(CNNLayers, self).__init__()
self.channels = channels
self.kernels = kernels
self.strides = strides
self.activation = activation
self.init_method = init_method
self.num_of_nets = len(self.channels) - 1
if len(kernels) != len(strides) or self.num_of_nets != (len(kernels)):
raise RuntimeError('channels, kernels and strides don\'t match\n')
cnn_modules = []
for i in range(self.num_of_nets):
cnn_modules.append(
nn.Conv2d(self.channels[i], self.channels[i + 1], self.kernels[i], stride=self.strides[i])
)
if self.activation.lower() == 'sigmoid':
cnn_modules.append(nn.Sigmoid())
elif self.activation.lower() == 'tanh':
cnn_modules.append(nn.Tanh())
elif self.activation.lower() == 'relu':
cnn_modules.append(nn.ReLU())
elif self.activation.lower() == 'leakyrelu':
cnn_modules.append(nn.LeakyReLU())
elif self.activation.lower() == 'none':
pass
self.cnn_layers = nn.Sequential(*cnn_modules)
if self.init_method is not None:
self.apply(self.init_weights)
def init_weights(self, module):
# We just initialize the module with normal distribution as the paper said
if isinstance(module, nn.Conv2d):
if self.init_method == 'norm':
normal_(module.weight.data, 0, 0.01)
if module.bias is not None:
module.bias.data.fill_(0.0)
def forward(self, input_feature):
return self.cnn_layers(input_feature)
class SparseDropout(nn.Module):
"""
This is a Module that executes Dropout on a PyTorch sparse tensor.
"""
def __init__(self, p=0.5):
super(SparseDropout, self).__init__()
# p is ratio of dropout
# convert to keep probability
self.kprob = 1 - p
def forward(self, x):
if not self.training:
return x
# sample the keep mask on the same device as the sparse tensor's values
mask = ((torch.rand(x._values().size(), device=x.device) + self.kprob).floor()).type(torch.bool)
rc = x._indices()[:, mask]
val = x._values()[mask] * (1.0 / self.kprob)
return torch.sparse_coo_tensor(rc, val, x.shape)
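# Usage sketch for SparseDropout (illustrative only; the variable names are assumptions
# of this example). During training it drops individual non-zero entries of a sparse COO
# tensor and rescales the survivors by 1 / keep_prob, mirroring standard inverted dropout;
# in eval mode the input is returned unchanged.
#   >>> indices = torch.tensor([[0, 1, 1], [2, 0, 2]])
#   >>> values = torch.tensor([3.0, 4.0, 5.0])
#   >>> adj = torch.sparse_coo_tensor(indices, values, (2, 3))
#   >>> drop = SparseDropout(p=0.5)
#   >>> out = drop(adj)       # some entries randomly removed, the rest scaled by 2.0
#   >>> _ = drop.eval()
#   >>> drop(adj) is adj
#   True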
from .trainer import *
__all__ = ['Trainer']
# Copyright (c) 2024 westlake-repl
# Copyright (c) 2024 Bytedance Ltd. and/or its affiliate
# SPDX-License-Identifier: MIT
# This file has been modified by Junyi Chen.
#
# Original file was released under MIT, with the full license text
# available at https://choosealicense.com/licenses/mit/.
#
# This modified file is released under the same license.
import os
import sys
from logging import getLogger
from time import time
import time as t
import numpy as np
import torch
import torch.optim as optim
import torch.distributed as dist
from tqdm import tqdm
import deepspeed
from REC.data.dataset import BatchTextDataset
from REC.data.dataset.collate_fn import customize_rmpad_collate
from torch.utils.data import DataLoader
from REC.evaluator import Evaluator, Collector
from REC.utils import ensure_dir, get_local_time, early_stopping, calculate_valid_score, dict2str, \
get_tensorboard, set_color, get_gpu_usage, WandbLogger
from REC.utils.lr_scheduler import *
import lightning as L
from lightning.fabric.strategies import DeepSpeedStrategy, DDPStrategy
class Trainer(object):
def __init__(self, config, model):
super(Trainer, self).__init__()
self.config = config
self.model = model
self.logger = getLogger()
self.wandblogger = WandbLogger(config)
self.optim_args = config['optim_args']
self.epochs = config['epochs']
self.eval_step = min(config['eval_step'], self.epochs)
self.stopping_step = config['stopping_step']
self.clip_grad_norm = config.get('clip_grad_norm', 1.0)
self.valid_metric = config['valid_metric'].lower()
self.valid_metric_bigger = config['valid_metric_bigger']
self.test_batch_size = config['eval_batch_size']
self.gpu_available = torch.cuda.is_available() and config['use_gpu']
self.device = config['device']
self.rank = torch.distributed.get_rank()
if self.rank == 0:
self.tensorboard = get_tensorboard(self.logger)
self.checkpoint_dir = config['checkpoint_dir']
if self.rank == 0:
ensure_dir(self.checkpoint_dir)
self.saved_model_name = '{}-{}.pth'.format(self.config['model'], 0)
self.saved_model_file = os.path.join(self.checkpoint_dir, self.saved_model_name)
self.use_text = config['use_text']
self.start_epoch = 0
self.cur_step = 0
self.best_valid_score = -np.inf if self.valid_metric_bigger else np.inf
self.best_valid_result = None
self.train_loss_dict = dict()
self.optimizer = self._build_optimizer()
self.update_interval = config['update_interval'] if config['update_interval'] else 20
self.scheduler_config = config['scheduler_args']
if config['freeze_prefix'] or config['freeze_ad']:
freeze_prefix = config['freeze_prefix'] if config['freeze_prefix'] else []
if config['freeze_ad']:
freeze_prefix.extend(['item_llm', 'item_emb_tokens'])
if not config['ft_item']:
freeze_prefix.extend(['item_embedding'])
self._freeze_params(freeze_prefix)
for n, p in self.model.named_parameters():
self.logger.info(f"{n} {p.size()} {p.requires_grad}")
self.eval_collector = Collector(config)
self.evaluator = Evaluator(config)
self.item_feature = None
self.tot_item_num = None
def _freeze_params(self, freeze_prefix):
for name, param in self.model.named_parameters():
for prefix in freeze_prefix:
if name.startswith(prefix):
self.logger.info(f"freeze_params: {name}")
param.requires_grad = False
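# _build_scheduler below reads config['scheduler_args'], which needs at least a 'type'
# key ('cosine', 'linear', or anything else for a constant schedule); fit() additionally
# reads an optional 'warmup' ratio from the same dict. A minimal sketch of such a config
# (the YAML layout is an assumption of this illustration):
#   scheduler_args:
#     type: cosine
#     warmup: 0.01   # fraction of total training steps used for warmup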
def _build_scheduler(self, warmup_steps=None, tot_steps=None):
if self.scheduler_config['type'] == 'cosine':
self.logger.info(f"Use consine scheduler with {warmup_steps} warmup {tot_steps} total steps")
return get_cosine_schedule_with_warmup(self.optimizer, warmup_steps, tot_steps)
elif self.scheduler_config['type'] in ('linear', 'liner'):  # also accept the legacy misspelling 'liner'
self.logger.info(f"Use linear scheduler with {warmup_steps} warmup {tot_steps} total steps")
return get_linear_schedule_with_warmup(self.optimizer, warmup_steps, tot_steps)
else:
self.logger.info(f"Use constant scheduler")
return get_constant_schedule(self.optimizer)
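# _build_optimizer below switches on the shape of config['optim_args']: with exactly four
# entries it builds two AdamW parameter groups (parameters whose name contains
# 'visual_encoder' versus the rest, or the split defined by 'decay_check_name' when that
# is set); otherwise it falls back to a single group driven by 'learning_rate' and
# 'weight_decay'. Two illustrative configurations (a sketch; the exact layout is an
# assumption of this example):
#   optim_args: {modal_lr: 1.0e-5, modal_decay: 0.01, rec_lr: 1.0e-4, rec_decay: 0.01}
#   optim_args: {learning_rate: 1.0e-4, weight_decay: 0.01}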
def _build_optimizer(self):
if len(self.optim_args) == 4:
params = self.model.named_parameters()
modal_params = []
recsys_params = []
modal_decay_params = []
recsys_decay_params = []
decay_check_name = self.config['decay_check_name']
for index, (name, param) in enumerate(params):
if param.requires_grad:
if 'visual_encoder' in name:
modal_params.append(param)
else:
recsys_params.append(param)
if decay_check_name:
if decay_check_name in name:
modal_decay_params.append(param)
else:
recsys_decay_params.append(param)
if decay_check_name:
optimizer = optim.AdamW([
{'params': modal_decay_params, 'lr': self.optim_args['modal_lr'], 'weight_decay': self.optim_args['modal_decay']},
{'params': recsys_decay_params, 'lr': self.optim_args['rec_lr'], 'weight_decay': self.optim_args['rec_decay']}
])
optim_output = set_color(f'recsys_decay_params_len: {len(recsys_decay_params)} modal_params_decay_len: {len(modal_decay_params)}', 'blue')
self.logger.info(optim_output)
else:
optimizer = optim.AdamW([
{'params': modal_params, 'lr': self.optim_args['modal_lr'], 'weight_decay': self.optim_args['modal_decay']},
{'params': recsys_params, 'lr': self.optim_args['rec_lr'], 'weight_decay': self.optim_args['rec_decay']}
])
optim_output = set_color(f'recsys_lr_params_len: {len(recsys_params)} modal_lr_params_len: {len(modal_params)}', 'blue')
self.logger.info(optim_output)
elif self.config['lr_mult_prefix'] and self.config['lr_mult_rate']:
normal_params_dict = {
"params": [],
"lr": self.optim_args['learning_rate'],
"weight_decay": self.optim_args['weight_decay']
}
high_lr_params_dict = {
"params": [],
"lr": self.optim_args['learning_rate'] * self.config['lr_mult_rate'],
"weight_decay": self.optim_args['weight_decay']
}
self.logger.info(f'Use higher lr rate {self.config["lr_mult_rate"]} x {self.optim_args["learning_rate"]} for prefix {self.config["lr_mult_prefix"]}')
for n, p in self.model.named_parameters():
if any(n.startswith(x) for x in self.config['lr_mult_prefix']):
self.logger.info(f"high lr param: {n} {self.optim_args['learning_rate'] * self.config['lr_mult_rate']}")
high_lr_params_dict["params"].append(p)
else:
normal_params_dict["params"].append(p)
optimizer = optim.AdamW([normal_params_dict, high_lr_params_dict])
elif self.config['optimizer_kwargs']:
params = self.model.parameters()
self.config['optimizer_kwargs']['optimizer']['params']['lr'] = self.optim_args['learning_rate']
self.config['optimizer_kwargs']['optimizer']['params']['weight_decay'] = self.optim_args['weight_decay']
optimizer = deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam(params, **self.config['optimizer_kwargs']['optimizer']['params'])
else:
params = self.model.parameters()
optimizer = optim.AdamW(params, lr=self.optim_args['learning_rate'], weight_decay=self.optim_args['weight_decay'])
return optimizer
def _train_epoch(self, train_data, epoch_idx, show_progress=False):
self.model.train()
total_loss = 0
if self.rank == 0:
pbar = tqdm(
total=len(train_data),
miniters=self.update_interval,
desc=set_color(f"Train [{epoch_idx:>3}/{self.epochs:>3}]", 'pink'),
file=sys.stdout
)
bwd_time = t.time()
for batch_idx, data in enumerate(train_data):
start_time = bwd_time
self.optimizer.zero_grad()
data = self.to_device(data)
data_time = t.time()
losses = self.model(data)
fwd_time = t.time()
if self.config['loss'] and self.config['loss'] == 'nce':
model_out = losses
losses = model_out.pop('loss')
self._check_nan(losses)
total_loss = total_loss + losses.item()
self.lite.backward(losses)
grad_norm = self.optimizer.step()
bwd_time = t.time()
if self.scheduler_config:
self.lr_scheduler.step()
if show_progress and self.rank == 0 and batch_idx % self.update_interval == 0:
msg = f"loss: {losses:.4f} data: {data_time-start_time:.3f} fwd: {fwd_time-data_time:.3f} bwd: {bwd_time-fwd_time:.3f}"
if self.scheduler_config:
msg = f"lr: {self.lr_scheduler.get_lr()[0]:.7f} " + msg
if self.config['loss'] and self.config['loss'] == 'nce':
for k, v in model_out.items():
msg += f" {k}: {v:.3f}"
if grad_norm:
msg = msg + f" grad_norm: {grad_norm.sum():.4f}"
pbar.set_postfix_str(msg, refresh=False)
pbar.update(self.update_interval)
self.logger.info("\n" + "-"*50)
if self.config['debug'] and batch_idx >= 10:
break
return total_loss
def _valid_epoch(self, valid_data, show_progress=False):
torch.distributed.barrier()
valid_result = self.evaluate(valid_data, load_best_model=False, show_progress=show_progress)
valid_score = calculate_valid_score(valid_result, self.valid_metric)
torch.distributed.barrier()
return valid_score, valid_result
def _save_checkpoint(self, epoch, verbose=True):
r"""Store the model parameters information and training information.
Args:
epoch (int): the current epoch id
"""
state = {
"model": self.model,
"optimizer": self.optimizer,
'config': self.config,
'epoch': epoch,
'cur_step': self.cur_step,
'best_valid_score': self.best_valid_score,
'rng_state': torch.get_rng_state(),
'cuda_rng_state': torch.cuda.get_rng_state()
}
self.lite.save(os.path.join(self.checkpoint_dir, self.saved_model_name), state=state)
if self.rank == 0 and verbose:
self.logger.info(set_color('Saving current', 'blue') + f': {self.saved_model_file}')
def _check_nan(self, loss):
if torch.isnan(loss):
raise ValueError('Training loss is nan')
def _generate_train_loss_output(self, epoch_idx, s_time, e_time, losses):
des = self.config['loss_decimal_place'] or 4
train_loss_output = (set_color('epoch %d training', 'green') + ' [' + set_color('time', 'blue') +
': %.2fs, ') % (epoch_idx, e_time - s_time)
if isinstance(losses, tuple):
des = (set_color('train_loss%d', 'blue') + ': %.' + str(des) + 'f')
train_loss_output += ', '.join(des % (idx + 1, loss) for idx, loss in enumerate(losses))
else:
des = '%.' + str(des) + 'f'
train_loss_output += set_color('train loss', 'blue') + ': ' + des % losses
return train_loss_output + ']'
def _add_train_loss_to_tensorboard(self, epoch_idx, losses, tag='Loss/Train'):
if isinstance(losses, tuple):
for idx, loss in enumerate(losses):
self.tensorboard.add_scalar(tag + str(idx), loss, epoch_idx)
else:
self.tensorboard.add_scalar(tag, losses, epoch_idx)
def _add_hparam_to_tensorboard(self, best_valid_result):
# base hparam
hparam_dict = {
'learning_rate': self.config['learning_rate'],
'weight_decay': self.config['weight_decay'],
'train_batch_size': self.config['train_batch_size']
}
# unrecorded parameter
unrecorded_parameter = {
parameter
for parameters in self.config.parameters.values() for parameter in parameters
}.union({'model', 'dataset', 'config_files', 'device'})
# other model-specific hparam
hparam_dict.update({
para: val
for para, val in self.config.final_config_dict.items() if para not in unrecorded_parameter
})
# sanitize keys for tensorboard and cast unsupported value types to str
hparam_dict = {k.replace('@', '_'): v for k, v in hparam_dict.items()}
for k in hparam_dict:
    if hparam_dict[k] is not None and not isinstance(hparam_dict[k], (bool, str, float, int)):
        hparam_dict[k] = str(hparam_dict[k])
self.tensorboard.add_hparams(hparam_dict, {'hparam/best_valid_result': best_valid_result})
def to_device(self, data):
device = self.device
if isinstance(data, tuple) or isinstance(data, list):
tdata = ()
for d in data:
d = d.to(device)
tdata += (d,)
return tdata
elif isinstance(data, dict):
for k, v in data.items():
data[k] = v.to(device)
return data
else:
return data.to(device)
def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progress=False, callback_fn=None):
if self.scheduler_config:
warmup_rate = self.scheduler_config.get('warmup', 0.001)
tot_steps = len(train_data) * self.epochs
warmup_steps = tot_steps * warmup_rate
self.lr_scheduler = self._build_scheduler(warmup_steps=warmup_steps, tot_steps=tot_steps)
world_size, local_world_size = int(os.environ['WORLD_SIZE']), int(os.environ['LOCAL_WORLD_SIZE'])
nnodes = world_size // local_world_size
precision = self.config['precision'] if self.config['precision'] else '32'
if self.config['strategy'] == 'deepspeed':
self.logger.info(f"Use deepspeed strategy")
strategy = DeepSpeedStrategy(stage=self.config["stage"], precision=precision)
self.lite = L.Fabric(accelerator='gpu', strategy=strategy, precision=precision, num_nodes=nnodes)
else:
self.logger.info(f"Use DDP strategy")
strategy = DDPStrategy(find_unused_parameters=True)
self.lite = L.Fabric(accelerator='gpu', strategy=strategy, precision=precision, num_nodes=nnodes)
self.lite.launch()
self.model, self.optimizer = self.lite.setup(self.model, self.optimizer)
if self.config['auto_resume']:
raise NotImplementedError
valid_step = 0
for epoch_idx in range(self.start_epoch, self.epochs):
# train
if self.config['need_training'] is None or self.config['need_training']:
train_data.sampler.set_epoch(epoch_idx)
training_start_time = time()
train_loss = self._train_epoch(train_data, epoch_idx, show_progress=show_progress)
self.train_loss_dict[epoch_idx] = sum(train_loss) if isinstance(train_loss, tuple) else train_loss
training_end_time = time()
train_loss_output = \
self._generate_train_loss_output(epoch_idx, training_start_time, training_end_time, train_loss)
if verbose:
self.logger.info(train_loss_output)
if self.rank == 0:
self._add_train_loss_to_tensorboard(epoch_idx, train_loss)
self.wandblogger.log_metrics({'epoch': epoch_idx, 'train_loss': train_loss, 'train_step': epoch_idx}, head='train')
if self.eval_step <= 0 or not valid_data:
if saved:
self._save_checkpoint(epoch_idx, verbose=verbose)
continue
if (epoch_idx + 1) % self.eval_step == 0:
valid_start_time = time()
valid_score, valid_result = self._valid_epoch(valid_data, show_progress=show_progress)
self.best_valid_score, self.cur_step, stop_flag, update_flag = early_stopping(
valid_score,
self.best_valid_score,
self.cur_step,
max_step=self.stopping_step,
bigger=self.valid_metric_bigger
)
valid_end_time = time()
valid_score_output = (set_color("epoch %d evaluating", 'green') + " [" + set_color("time", 'blue')
+ ": %.2fs, " + set_color("valid_score", 'blue') + ": %f]") % \
(epoch_idx, valid_end_time - valid_start_time, valid_score)
valid_result_output = set_color('valid result', 'blue') + ': \n' + dict2str(valid_result)
if verbose:
self.logger.info(valid_score_output)
self.logger.info(valid_result_output)
if self.rank == 0:
self.tensorboard.add_scalar('Valid_score', valid_score, epoch_idx)
for name, value in valid_result.items():
self.tensorboard.add_scalar(name.replace('@', '_'), value, epoch_idx)
self.wandblogger.log_metrics({**valid_result, 'valid_step': valid_step}, head='valid')
if update_flag:
if saved:
self._save_checkpoint(epoch_idx, verbose=verbose)
self.best_valid_result = valid_result
if callback_fn:
callback_fn(epoch_idx, valid_score)
if stop_flag:
stop_output = 'Finished training, best eval result in epoch %d' % \
(epoch_idx - self.cur_step * self.eval_step)
if verbose:
self.logger.info(stop_output)
break
valid_step += 1
return self.best_valid_score, self.best_valid_result
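# _full_sort_batch_eval below scores every item for each evaluation user, then masks out
# item index 0 (conventionally the padding item) and all previously interacted items
# selected by history_index with -inf so they cannot be ranked, before handing the
# [num_users, tot_item_num] score matrix to the metric collector.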
@torch.no_grad()
def _full_sort_batch_eval(self, batched_data):
user, time_seq, history_index, positive_u, positive_i = batched_data
interaction = self.to_device(user)
time_seq = self.to_device(time_seq)
if self.config['model'] == 'HLLM':
if self.config['stage'] == 3:
scores = self.model.module.predict(interaction, time_seq, self.item_feature)
else:
scores = self.model((interaction, time_seq, self.item_feature), mode='predict')
else:
scores = self.model.module.predict(interaction, time_seq, self.item_feature)
scores = scores.view(-1, self.tot_item_num)
scores[:, 0] = -np.inf
if history_index is not None:
scores[history_index] = -np.inf
return scores, positive_u, positive_i
@torch.no_grad()
def compute_item_feature(self, config, data):
if self.use_text:
item_data = BatchTextDataset(config, data)
item_batch_size = config['MAX_ITEM_LIST_LENGTH'] * config['train_batch_size']
item_loader = DataLoader(item_data, batch_size=item_batch_size, num_workers=8, shuffle=False, pin_memory=True, collate_fn=customize_rmpad_collate)
self.logger.info(f"Inference item_data with {item_batch_size = } {len(item_loader) = }")
self.item_feature = []
with torch.no_grad():
for idx, items in tqdm(enumerate(item_loader), total=len(item_loader)):
items = self.to_device(items)
items = self.model(items, mode='compute_item')
self.item_feature.append(items)
if isinstance(items, tuple):
self.item_feature = torch.cat([x[0] for x in self.item_feature]), torch.cat([x[1] for x in self.item_feature])
else:
self.item_feature = torch.cat(self.item_feature)
if self.config['stage'] == 3:
self.item_feature = self.item_feature.bfloat16()
else:
with torch.no_grad():
self.item_feature = self.model.module.compute_item_all()
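# distributed_concat below all-gathers one tensor per rank, concatenates them, sums the
# result and divides by num_total_examples; this yields the global mean of a metric
# provided each rank's tensor already holds that metric summed over its local examples.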
def distributed_concat(self, tensor, num_total_examples):
output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())]
torch.distributed.all_gather(output_tensors, tensor)
concat = torch.cat(output_tensors, dim=0)
return concat.sum() / num_total_examples
def evaluate(self, eval_data, load_best_model=True, model_file=None, show_progress=False, init_model=False):
if not eval_data:
return
if init_model:
world_size, local_world_size = int(os.environ['WORLD_SIZE']), int(os.environ['LOCAL_WORLD_SIZE'])
nnodes = world_size // local_world_size
if self.config['strategy'] == 'deepspeed':
self.logger.info(f"Use deepspeed strategy")
precision = self.config['precision'] if self.config['precision'] else '32'
strategy = DeepSpeedStrategy(stage=self.config['stage'], precision=precision)
self.lite = L.Fabric(accelerator='gpu', strategy=strategy, precision=precision, num_nodes=nnodes)
self.lite.launch()
self.model, self.optimizer = self.lite.setup(self.model, self.optimizer)
else:
self.logger.info(f"Use DDP strategy")
precision = self.config['precision'] if self.config['precision'] else '32'
strategy = DDPStrategy(find_unused_parameters=True)
self.lite = L.Fabric(accelerator='gpu', strategy=strategy, precision=precision, num_nodes=nnodes)
self.lite.launch()
self.model = self.lite.setup(self.model)
if load_best_model:
checkpoint_file = model_file or self.saved_model_file
state = {"model": self.model}
self.lite.load(checkpoint_file, state)
message_output = 'Loading model structure and parameters from {}'.format(checkpoint_file)
self.logger.info(message_output)
with torch.no_grad():
self.model.eval()
eval_func = self._full_sort_batch_eval
self.tot_item_num = eval_data.dataset.dataload.item_num
self.compute_item_feature(self.config, eval_data.dataset.dataload)
iter_data = (
tqdm(
eval_data,
total=len(eval_data),
ncols=150,
desc=set_color(f"Evaluate ", 'pink'),
file=sys.stdout
) if show_progress and self.rank == 0 else eval_data
)
fwd_time = t.time()
for batch_idx, batched_data in enumerate(iter_data):
start_time = fwd_time
data_time = t.time()
scores, positive_u, positive_i = eval_func(batched_data)
fwd_time = t.time()
if show_progress and self.rank == 0:
iter_data.set_postfix_str(f"data: {data_time-start_time:.3f} fwd: {fwd_time-data_time:.3f}", refresh=False)
self.eval_collector.eval_batch_collect(scores, positive_u, positive_i)
num_total_examples = len(eval_data.sampler.dataset)
struct = self.eval_collector.get_data_struct()
result = self.evaluator.evaluate(struct)
metric_decimal_place = 5 if self.config['metric_decimal_place'] is None else self.config['metric_decimal_place']
for k, v in result.items():
result_cpu = self.distributed_concat(torch.tensor([v]).to(self.device), num_total_examples).cpu()
result[k] = round(result_cpu.item(), metric_decimal_place)
self.wandblogger.log_eval_metrics(result, head='eval')
return result
from .logger import init_logger, set_color
from .utils import get_local_time, ensure_dir, get_model, \
early_stopping, calculate_valid_score, dict2str, init_seed, get_tensorboard, get_gpu_usage
from .enum_type import *
from .argument_list import *
from .wandblogger import WandbLogger
__all__ = [
'init_logger', 'get_local_time', 'ensure_dir', 'get_model', 'early_stopping',
'calculate_valid_score', 'dict2str', 'Enum', 'EvaluatorType', 'InputType',
'init_seed', 'general_arguments', 'training_arguments', 'evaluation_arguments',
'dataset_arguments', 'get_tensorboard', 'set_color', 'get_gpu_usage', 'WandbLogger'
]