Unverified Commit 7e9f10ac authored by Susnato Dhar's avatar Susnato Dhar Committed by GitHub
Browse files

Add CLVP (#24745)

* init commit

* attention arch done except rotary emb

* rotary emb done

* text encoder working

* outputs matching

* arch first pass done

* make commands done, tests and docs remaining

* all tests passed, only docs remaining

* docs done

* doc-builder fix

* convert script removed(not relevant)

* minor comments done

* added ckpt conversion script

* tokenizer done

* very minor fix of index.md 2

* mostly make fixup related

* all done except fe and rotary emb

* very small change

* removed unidecode dependency

* style changes

* tokenizer removed require_backends

* added require_inflect to tokenizer tests

* removed VOCAB_FILES in tokenizer test

* inflect dependency removed

* added rotary pos emb cache and simplified the apply method

* style

* little doc change

* more comments

* feature extractor added

* added processor

* auto-regressive config added

* added CLVPConditioningEncoder

* comments done except the test one

* weights added successfull(NOT tested)

* tokenizer fix with numbers

* generate outputs matching

* almost tests passing Integ tests not written

* Integ tests added

* major CUDA error fixed

* docs done

* rebase and multiple fixes

* fixed rebase overwrites

* generate code simplified and tests for AutoRegressive model added

* minor changes

* refectored gpt2 code in clvp file

* weights done and all code refactored

* mostly done except the fast_tokenizer

* doc test fix

* config file's doc fixes

* more config fix

* more comments

* tokenizer comments mostly done

* modeling file mostly refactored and can load modules

* ClvpEncoder tested

* ClvpDecoder, ClvpModel and ClvpForCausalLM tested

* integration and all tests passed

* more fixes

* docs almost done

* ckpt conversion refectored

* style and some failing tests fix

* comments

* temporary output fix but test_assisted_decoding_matches_greedy_search test fails

* majority changes done

* use_cache outputs same now! Along with the asisted_greedy_decoding test fix

* more comments

* more comments

* prepare_inputs_for_generation fixed and _prepare_model_inputs added

* style fix

* clvp.md change

* moved clvpconditionalencoder norms

* add model to new index

* added tokenizer input_ids_with_special_tokens

* small fix

* config mostly done

* added config-tester and changed conversion script

* more comments

* comments

* style fix

* some comments

* tokenizer changed back to prev state

* small commnets

* added output hidden states for the main model

* style fix

* comments

* small change

* revert small change

* .

* Update clvp.md

* Update test_modeling_clvp.py

* :)

* some minor change

* new fixes

* remove to_dict from FE
parent 9dd58c53
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for CLVP
"""
from typing import List, Optional, Union
import numpy as np
from ...audio_utils import mel_filter_bank, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class ClvpFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a CLVP feature extractor.
This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.
This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.
Args:
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 22050):
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
default_audio_length (`int`, *optional*, defaults to 6):
The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
automatically be set to default_audio_length * `self.sampling_rate`.
hop_length (`int`, *optional*, defaults to 256):
Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
chunk_length (`int`, *optional*, defaults to 30):
The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
sequences.
n_fft (`int`, *optional*, defaults to 1024):
Size of the Fourier transform.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
mel_norms (`list` of length `feature_size`, *optional*):
If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
mel-filter.
return_attention_mask (`bool`, *optional*, defaults to `False`):
Whether to return the attention mask. If left to the default, it will return the attention mask.
[What are attention masks?](../glossary#attention-mask)
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=22050,
default_audio_length=6,
hop_length=256,
chunk_length=30,
n_fft=1024,
padding_value=0.0,
mel_norms=None,
return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
return_attention_mask=return_attention_mask,
**kwargs,
)
self.n_fft = n_fft
self.hop_length = hop_length
self.chunk_length = chunk_length
self.n_samples = chunk_length * sampling_rate
self.nb_max_frames = self.n_samples // hop_length
self.sampling_rate = sampling_rate
self.default_audio_length = default_audio_length
self.mel_norms = mel_norms
self.mel_filters = mel_filter_bank(
num_frequency_bins=1 + (n_fft // 2),
num_mel_filters=feature_size,
min_frequency=0.0,
max_frequency=8000.0,
sampling_rate=sampling_rate,
norm="slaney",
mel_scale="htk",
)
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
"""
This method first computes the log-mel spectrogram of the provided audio then applies normalization along the
each mel-filterbank, if `mel_norms` is provided.
"""
log_spec = spectrogram(
waveform,
window_function(self.n_fft, "hann"),
frame_length=self.n_fft,
hop_length=self.hop_length,
power=2.0,
mel_filters=self.mel_filters,
log_mel=None,
)
log_spec = np.log(np.clip(log_spec, a_min=1e-5, a_max=None))
if self.mel_norms is not None:
log_spec = log_spec / np.array(self.mel_norms)[:, None]
return log_spec
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
sampling_rate: Optional[int] = None,
truncation: bool = True,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = True,
padding: Optional[str] = "max_length",
max_length: Optional[int] = None,
**kwargs,
) -> BatchFeature:
"""
`ClvpFeatureExtractor` is used to extract various voice specific properties such as the pitch and tone of the
voice, speaking speed, and even speaking defects like a lisp or stuttering from a sample voice or `raw_speech`.
First the voice is padded or truncated in a way such that it becomes a waveform of `self.default_audio_length`
seconds long and then the log-mel spectrogram is extracted from it.
Args:
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
stereo, i.e. single float per timestep.
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
pipeline.
truncation (`bool`, *optional*, default to `True`):
Activates truncation to cut input sequences longer than *max_length* to *max_length*.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (`bool`, *optional*, defaults to `True`):
Whether to return the attention mask. If left to the default, it will return the attention mask.
[What are attention masks?](../glossary#attention-mask)
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
padding_value (`float`, defaults to 0.0):
The value that is used to fill the padding values / vectors.
max_length (`int`, *optional*):
The maximum input length of the inputs.
"""
if sampling_rate is not None:
if sampling_rate != self.sampling_rate:
raise ValueError(
f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
f" was sampled with {self.sampling_rate} and not {sampling_rate}."
)
else:
logger.warning(
"It is strongly recommended to pass the `sampling_rate` argument to this function. "
"Failing to do so can result in silent errors that might be hard to debug."
)
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
if is_batched_numpy and len(raw_speech.shape) > 2:
raise ValueError(f"Only mono-channel audio is supported for input to {self}")
is_batched = is_batched_numpy or (
isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
)
if is_batched:
raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech, dtype=np.float32)
elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
raw_speech = raw_speech.astype(np.float32)
# always return batch
if not is_batched:
raw_speech = [np.asarray([raw_speech]).T]
batched_speech = BatchFeature({"input_features": raw_speech})
max_length = self.default_audio_length * self.sampling_rate if max_length is None else max_length
padded_inputs = self.pad(
batched_speech,
padding=padding,
max_length=max_length,
truncation=truncation,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
# make sure list is in array format
input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
input_features = [
self._np_extract_fbank_features(waveform).astype(np.float32) for waveform in input_features[0]
]
if isinstance(input_features[0], List):
padded_inputs["input_features"] = [np.asarray(feature) for feature in input_features]
else:
padded_inputs["input_features"] = input_features
return padded_inputs.convert_to_tensors(return_tensors)
# coding=utf-8
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch CLVP model."""
import copy
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from ...activations import ACT2FN
from ...generation import GenerationConfig
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPooling,
CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import PreTrainedModel, SequenceSummary
from ...pytorch_utils import Conv1D
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_clvp import (
ClvpConfig,
ClvpDecoderConfig,
ClvpEncoderConfig,
)
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "susnato/clvp_dev"
CLVP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"susnato/clvp_dev",
# See all Clvp models at https://huggingface.co/models?filter=clvp
]
# Copied from transformers.models.clip.modeling_clip.contrastive_loss
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->clvp, image_loss->speech_loss
def clvp_loss(similarity: torch.Tensor) -> torch.Tensor:
caption_loss = contrastive_loss(similarity)
speech_loss = contrastive_loss(similarity.t())
return (caption_loss + speech_loss) / 2.0
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`):
The position indices of the tokens corresponding to the query and key tensors. For example, this can be
used to pass offsetted position ids when working with a KV-cache.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
@dataclass
class ClvpEncoderOutput(ModelOutput):
"""
Base class for CLVP encoder's outputs that contains a pooling of the last hidden states as well as a projection
output (a linear layer on top of the pooled output).
Args:
embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
The embeddings obtained by applying the projection layer to the pooler_output.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
The hidden state of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Pooled output of the `last_hidden_state`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
embeds: Optional[torch.FloatTensor] = None
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ClvpOutput(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for speech-text similarity.
speech_ids (`torch.LongTensor`, *optional*):
speech_ids (or speech candidates) generated by the `ClvpForCausalLM` model.
logits_per_speech (`torch.FloatTensor` of shape `(speech_batch_size, text_batch_size)`):
The scaled dot product scores between `speech_embeds` and `text_embeds`. This represents the speech-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, speech_batch_size)`):
The scaled dot product scores between `text_embeds` and `speech_embeds`. This represents the text-speech
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The text embeddings obtained by applying the projection layer to the pooled output of the text encoder
model.
speech_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
The speech embeddings obtained by applying the projection layer to the pooled output of the speech encoder
model.
text_model_output (`BaseModelOutputWithPooling`):
The pooled output of the `last_hidden_state` of the text encoder Model.
speech_model_output (`BaseModelOutputWithPooling`):
The pooled output of the `last_hidden_state` of the speech encoder Model.
decoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the decoder model.
text_encoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the text encoder model.
speech_encoder_hidden_states (`torch.FloatTensor`, *optional*):
The hidden states of the speech encoder model.
"""
loss: Optional[torch.FloatTensor] = None
speech_ids: Optional[torch.LongTensor] = None
logits_per_speech: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
speech_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
speech_model_output: BaseModelOutputWithPooling = None
decoder_hidden_states: torch.FloatTensor = None
text_encoder_hidden_states: torch.FloatTensor = None
speech_encoder_hidden_states: torch.FloatTensor = None
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Clvp
class ClvpRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
ClvpRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
class ClvpRotaryPositionalEmbedding(nn.Module):
"""
Rotary Position Embedding Class for CLVP. It was proposed in the paper 'ROFORMER: ENHANCED TRANSFORMER WITH ROTARY
POSITION EMBEDDING', Please see https://arxiv.org/pdf/2104.09864v1.pdf .
"""
def __init__(self, config):
super().__init__()
dim = max(config.projection_dim // (config.num_attention_heads * 2), 32)
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer("inv_freq", inv_freq)
self.cached_sequence_length = None
self.cached_rotary_positional_embedding = None
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
sequence_length = hidden_states.shape[1]
if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
return self.cached_rotary_positional_embedding
self.cached_sequence_length = sequence_length
time_stamps = torch.arange(sequence_length, device=hidden_states.device).type_as(self.inv_freq)
freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
embeddings = torch.cat((freqs, freqs), dim=-1)
self.cached_rotary_positional_embedding = embeddings.unsqueeze(0)
return self.cached_rotary_positional_embedding
class ClvpSelfAttention(nn.Module):
"""
Multi-headed attention to combine Absolute and Rotary Positional Embeddings into a single Attention module.
"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
if hasattr(config, "max_position_embeddings"):
max_positions = config.max_position_embeddings
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))
bias = bias.view(1, 1, max_positions, max_positions)
self.register_buffer("bias", bias, persistent=False)
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_attention_bias)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# Copied from transformers.models.clip.modeling_clip.CLIPAttention._shape
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.FloatTensor,
rotary_pos_emb: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
# Raise error when position_ids is None but rotary_pos_emb is provided, because we need that when applying
# rotary_pos_emb to query and key states.
if rotary_pos_emb is not None and position_ids is None:
raise ValueError("`position_ids` must be provided when `rotary_pos_emb` is not None.")
bsz, _, embed_dim = hidden_states.size()
# get query proj
query_states = self._shape(self.q_proj(hidden_states), -1, bsz) * self.scale
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
if past_key_value is not None:
past_key, past_value = past_key_value
key_states = torch.cat((past_key, key_states), dim=-2)
value_states = torch.cat((past_value, value_states), dim=-2)
if use_cache is True:
present = (key_states, value_states)
else:
present = None
if rotary_pos_emb is not None:
rotary_emb_dim = rotary_pos_emb.shape[-1]
# Partial rotary embedding
query_rot, query_pass = (
query_states[..., :rotary_emb_dim],
query_states[..., rotary_emb_dim:],
)
key_rot, key_pass = (
key_states[..., :rotary_emb_dim],
key_states[..., rotary_emb_dim:],
)
cos, sin = rotary_pos_emb.cos().squeeze(0), rotary_pos_emb.sin().squeeze(0)
query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
# [batch_size, num_heads, seq_length, head_dim]
query_states = torch.cat((query_rot, query_pass), dim=-1)
key_states = torch.cat((key_rot, key_pass), dim=-1)
tgt_len = query_states.shape[2]
src_len = key_states.shape[2]
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
# Mask heads if we want to
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.matmul(attn_probs, value_states)
if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, present, attn_weights
class ClvpGatedLinearUnit(nn.Module):
"""
`ClvpGatedLinearUnit` uses the second half of the `hidden_states` to act as a gate for the first half of the
`hidden_states` which controls the flow of data from the first of the tensor.
"""
def __init__(self, config):
super().__init__()
self.activation_fn = ACT2FN[config.hidden_act]
self.proj = nn.Linear(config.hidden_size, config.intermediate_size * 2)
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
return hidden_states * self.activation_fn(gate)
class ClvpEncoderMLP(nn.Module):
"""
This MLP is used in CLVP speech or text encoder models.
"""
def __init__(self, config):
super().__init__()
self.config = config
self.fc1 = ClvpGatedLinearUnit(config)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout_layer = nn.Dropout(config.dropout)
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.dropout_layer(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class ClvpEncoderLayer(nn.Module):
def __init__(self, config: ClvpConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.self_attn = ClvpSelfAttention(config)
self.mlp = ClvpEncoderMLP(config)
self.input_rmsnorm = ClvpRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.post_attention_rmsnorm = ClvpRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.FloatTensor,
rotary_pos_emb: torch.FloatTensor,
attention_mask: torch.LongTensor,
position_ids: torch.LongTensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, embed_dim)`):
input to the layer.
rotary_pos_emb (`torch.FloatTensor`):
rotary position embeddings generated by `ClvpRotaryPositionalEmbedding` module.
attention_mask (`torch.FloatTensor` of shape `(batch, 1, tgt_len, src_len)`):
attention mask where padding elements are indicated by very large negative values.
position_ids (`torch.LongTensor`):
Denotes position ids of the input tokens.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.input_rmsnorm(hidden_states)
attention_outputs = self.self_attn(
hidden_states=hidden_states,
rotary_pos_emb=rotary_pos_emb,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
)
hidden_states = attention_outputs[0]
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.post_attention_rmsnorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attention_outputs[-1],)
return outputs
# Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP with GPT2->ClvpDecoderMLP
class ClvpDecoderMLP(nn.Module):
def __init__(self, intermediate_size, config):
super().__init__()
embed_dim = config.hidden_size
self.c_fc = Conv1D(intermediate_size, embed_dim)
self.c_proj = Conv1D(embed_dim, intermediate_size)
self.act = ACT2FN[config.activation_function]
self.dropout = nn.Dropout(config.resid_pdrop)
def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
hidden_states = self.c_fc(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.c_proj(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class ClvpDecoderLayer(nn.Module):
def __init__(self, config):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
self.input_layernorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = ClvpSelfAttention(config)
self.post_attention_layernorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.mlp = ClvpDecoderMLP(inner_dim, config)
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
attn_outputs = self.attn(
hidden_states,
past_key_value=past_key_value,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
)
attn_output = attn_outputs[0]
outputs = attn_outputs[1:]
# residual connection
hidden_states = attn_output + residual
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
feed_forward_hidden_states = self.mlp(hidden_states)
# residual connection
hidden_states = residual + feed_forward_hidden_states
if use_cache:
outputs = (hidden_states,) + outputs
else:
outputs = (hidden_states,) + outputs[1:]
return outputs
class ClvpConditioningEncoder(nn.Module):
"""
This class processes the log-mel spectrograms(extracted by the Feature Extractor) and text tokens(produced by the
tokenizer) as inputs for the decoder model.
First each log-mel spectrogram is processed into a single vector which captures valuable characteristics from each
of them, then the text tokens are converted into token embeddings and position embeddings are added afterwards.
Both of these vectors are concatenated and then passed to the decoder model.
The text tokens helps to incorporate the "text information" and the log-mel spectrogram is used to specify the
"voice characteristics" into the generated mel tokens.
"""
def __init__(self, config: ClvpConfig):
super().__init__()
self.text_config = config.text_config
self.decoder_config = config.decoder_config
self.text_token_embedding = nn.Embedding(self.text_config.vocab_size, self.decoder_config.hidden_size)
self.text_position_embedding = nn.Embedding(
self.decoder_config.max_text_tokens, self.decoder_config.hidden_size
)
self.mel_conv = nn.Conv1d(self.decoder_config.feature_size, self.decoder_config.hidden_size, kernel_size=1)
# define group norms to be used before each attention layer
num_groups = self.compute_groupnorm_groups(self.decoder_config.hidden_size)
self.group_norms = nn.ModuleList(
[
nn.GroupNorm(num_groups, self.decoder_config.hidden_size, eps=1e-5, affine=True)
for _ in range(self.decoder_config.num_mel_attn_blocks)
]
)
# define the attention layers
self.mel_attn_blocks = nn.ModuleList(
[ClvpSelfAttention(self.decoder_config) for _ in range(self.decoder_config.num_mel_attn_blocks)]
)
self.gradient_checkpointing = False
def compute_groupnorm_groups(self, channels: int, groups: int = 32):
"""
Calculates the value of `num_groups` for nn.GroupNorm. This logic is taken from the official tortoise
repository. link :
https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/models/arch_util.py#L26
"""
if channels <= 16:
groups = 8
elif channels <= 64:
groups = 16
while channels % groups != 0:
groups = int(groups / 2)
if groups <= 2:
raise ValueError(
f"Number of groups for the GroupNorm must be greater than 2, but it is {groups}."
f"Please consider using a different `hidden_size`"
)
return groups
def forward(
self,
input_features: torch.FloatTensor,
input_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
):
# process text
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
# We add bos and eos input_ids in the modeling file instead of the tokenizer file to keep the logic simple
# This logic is specific to ClvpConditioningEncoder and not used by other modules.
input_ids = torch.nn.functional.pad(input_ids, (1, 0), value=self.text_config.bos_token_id)
input_ids = torch.nn.functional.pad(input_ids, (0, 1), value=self.text_config.eos_token_id)
batch_size, seq_length = input_ids.size()
inputs_embeds = self.text_token_embedding(input_ids)
# check if we need to update attention mask, if yes then pad it too
if attention_mask is not None and attention_mask.shape[1] != seq_length:
attention_mask = torch.nn.functional.pad(attention_mask, (1, 0), value=1)
attention_mask = torch.nn.functional.pad(attention_mask, (0, 1), value=1)
elif inputs_embeds is not None:
batch_size, seq_length = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
# construct attention mask if not given
if attention_mask is None:
attention_mask = torch.ones([batch_size, seq_length], dtype=torch.long, device=inputs_embeds.device)
position_ids = attention_mask.cumsum(-1) - 1
position_embeds = self.text_position_embedding(position_ids)
text_embeds = inputs_embeds + position_embeds
if self.gradient_checkpointing and self.training:
# process each log-mel spectrogram into a single vector
mel_spec = torch.utils.checkpoint.checkpoint(self.mel_conv, input_features)
for i, mel_attn_block in enumerate(self.mel_attn_blocks):
residual_mel_spec = mel_spec.transpose(1, 2)
mel_spec = torch.utils.checkpoint.checkpoint(self.group_norms[i], mel_spec).transpose(1, 2)
mel_spec = torch.utils.checkpoint.checkpoint(mel_attn_block, mel_spec)[0] + residual_mel_spec
mel_spec = mel_spec.transpose(1, 2)
else:
# process each log-mel spectrogram into a single vector
mel_spec = self.mel_conv(input_features)
for i, mel_attn_block in enumerate(self.mel_attn_blocks):
residual_mel_spec = mel_spec.transpose(1, 2)
mel_spec = self.group_norms[i](mel_spec).transpose(1, 2)
mel_spec = mel_attn_block(mel_spec)[0] + residual_mel_spec
mel_spec = mel_spec.transpose(1, 2)
mel_spec = mel_spec[:, :, 0]
mel_spec = mel_spec.unsqueeze(1)
# repeat if there is either (1 text vs N audios) or (N texts vs 1 audio)
if text_embeds.shape[0] == 1 and mel_spec.shape[0] != 1:
text_embeds = text_embeds.repeat(mel_spec.shape[0], 1, 1)
elif text_embeds.shape[0] != 1 and mel_spec.shape[0] == 1:
mel_spec = mel_spec.repeat(text_embeds.shape[0], 1, 1)
# If there is N texts and M audios we will raise error since the number of text and audio must be same.
elif text_embeds.shape[0] != mel_spec.shape[0]:
raise ValueError(
f"The number of texts and number of audios must be same. "
f"Found {text_embeds.shape[0]} texts vs {mel_spec.shape[0]} audios"
)
return torch.concat([mel_spec, text_embeds], dim=1)
class ClvpPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = ClvpConfig
base_model_prefix = "clvp"
supports_gradient_checkpointing = True
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
"""Initialize the weights"""
factor = self.config.initializer_factor
if isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=factor * 0.02)
elif isinstance(module, (nn.Linear, Conv1D, nn.Conv1d)):
module.weight.data.normal_(mean=0.0, std=factor * 0.02)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, ClvpEncoderMLP):
factor = self.config.initializer_factor
in_proj_std = (
(module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
)
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
nn.init.normal_(module.fc1.proj.weight if getattr(module.fc1, "proj") else module.fc1.weight, std=fc_std)
nn.init.normal_(module.fc2.weight, std=in_proj_std)
elif isinstance(module, ClvpEncoder):
config = self.config.text_config if hasattr(self.config, "text_config") else self.config
factor = config.initializer_factor
module.projection.weight.data.normal_(mean=0.0, std=factor * (config.hidden_size**-0.5))
elif isinstance(module, ClvpConditioningEncoder):
module.mel_conv.weight.data.normal_(mean=0.0, std=factor)
module.mel_conv.bias.data.zero_()
elif isinstance(module, ClvpForCausalLM):
for name, p in module.named_parameters():
if name == "c_proj.weight":
p.data.normal_(
mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers))
)
if isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
CLVP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`ClvpConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
CLVP_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, time_dim)`):
Indicates log mel-spectrogram representations for audio returned by [`ClvpFeatureExtractor`].
conditioning_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
inputs_embeds for `ClvpConditioningEncoder`. Can be used in place of `input_ids`.
text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
inputs_embeds for the text encoder model passed in place of `input_ids`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding text token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
CLVP_DECODER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
`past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
their past given to this model should not be passed as `input_ids` as they have already been computed.
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
`past_key_values`. In other words, the `attention_mask` always has to have the length:
`len(past_key_values) + len(input_ids)`
[What are attention masks?](../glossary#attention-mask)
token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
1]`:
- 0 corresponds to a *sentence A* token,
- 1 corresponds to a *sentence B* token.
[What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
`past_key_values`).
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class ClvpEncoder(ClvpPreTrainedModel):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`ClvpEncoderLayer`].
Args:
config: ClvpConfig
"""
def __init__(self, config: ClvpConfig):
super().__init__(config)
self.config = config
self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
self.rotary_pos_emb = ClvpRotaryPositionalEmbedding(config) if config.use_rotary_embedding else None
self.layers = nn.ModuleList([ClvpEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.sequence_summary = SequenceSummary(config)
self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
self.gradient_checkpointing = False
self.post_init()
def get_input_embeddings(self):
return self.token_embedding
def set_input_embeddings(self, value):
self.token_embedding = value
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
input embeddings for the model. This bypasses the model's internal embedding lookup matrix.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor`, *optional*):
Denotes the position ids of `input_ids`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
inputs_embeds = self.token_embedding(input_ids)
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
# expand attention_mask and create position_ids if needed
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(input_shape[1], dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0)
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
rotary_pos_emb = self.rotary_pos_emb(inputs_embeds) if self.rotary_pos_emb is not None else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
encoder_layer.__call__,
hidden_states,
rotary_pos_emb,
attention_mask,
position_ids,
)
else:
layer_outputs = encoder_layer(
hidden_states,
rotary_pos_emb,
attention_mask,
position_ids,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
last_hidden_state = hidden_states
last_hidden_state = self.final_layer_norm(last_hidden_state)
# take the mean over axis 1 and get pooled output
pooled_output = self.sequence_summary(last_hidden_state)
# apply the projection layer
embeds = self.projection(pooled_output)
if not return_dict:
return tuple(
v for v in [embeds, last_hidden_state, pooled_output, encoder_states, all_attentions] if v is not None
)
return ClvpEncoderOutput(
embeds=embeds,
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_states,
attentions=all_attentions,
)
class ClvpDecoder(ClvpPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ClvpDecoderLayer`]
"""
def __init__(self, config):
super().__init__(config)
self.config = config
self.input_embeds_layer = nn.Embedding(self.config.vocab_size, self.config.hidden_size)
self.position_embeds_layer = nn.Embedding(self.config.max_position_embeddings, self.config.hidden_size)
self.drop = nn.Dropout(self.config.embd_pdrop)
self.layers = nn.ModuleList([ClvpDecoderLayer(self.config) for _ in range(self.config.num_hidden_layers)])
self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=self.config.layer_norm_epsilon)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.input_embeds_layer
def set_input_embeddings(self, new_embeddings):
self.input_embeds_layer = new_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
for layer, heads in heads_to_prune.items():
self.layers[layer].attn.prune_heads(heads)
@add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
input_ids.shape[0]
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
inputs_embeds.shape[0]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
device = input_ids.device if input_ids is not None else inputs_embeds.device
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, input_shape[-1])
if past_key_values is None:
past_key_values_length = 0
past_key_values = tuple([None] * len(self.layers))
else:
past_key_values_length = past_key_values[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(
past_key_values_length, input_shape[-1] + past_key_values_length, dtype=torch.long, device=device
)
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
if inputs_embeds is None:
inputs_embeds = self.input_embeds_layer(input_ids)
position_embeds = self.position_embeds_layer(position_ids)
inputs_embeds = inputs_embeds + position_embeds
attention_mask = _prepare_4d_causal_attention_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x num_attention_heads x N x N
# head_mask has shape num_hidden_layers x batch x num_attention_heads x N x N
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
hidden_states = inputs_embeds
if token_type_ids is not None:
token_type_embeds = self.input_embeds_layer(token_type_ids)
hidden_states = hidden_states + token_type_embeds
hidden_states = self.drop(hidden_states)
output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
presents = () if use_cache else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
all_hidden_states = () if output_hidden_states else None
for i, (block, past_key_value) in enumerate(zip(self.layers, past_key_values)):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
outputs = torch.utils.checkpoint.checkpoint(
block.__call__,
hidden_states,
None,
attention_mask,
position_ids,
head_mask[i],
)
else:
outputs = block(
hidden_states,
past_key_value=past_key_value,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask[i],
use_cache=use_cache,
output_attentions=output_attentions,
)
hidden_states = outputs[0]
if use_cache is True:
presents = presents + (outputs[1],)
if output_attentions:
all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
hidden_states = self.layer_norm(hidden_states)
hidden_states = hidden_states.view(output_shape)
# Add last hidden state
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=presents,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
@add_start_docstrings(
"The bare Clvp decoder model outputting raw hidden-states without any specific head on top.",
CLVP_START_DOCSTRING,
)
class ClvpModel(ClvpPreTrainedModel):
def __init__(self, config: ClvpDecoderConfig):
super().__init__(config)
self.config = config
self.decoder = ClvpDecoder(self.config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.decoder.input_embeds_layer
def set_input_embeddings(self, value):
self.decoder.input_embeds_layer = value
def get_decoder(self):
return self.decoder
@add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
decoder_outputs = self.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
hidden_states=decoder_outputs.hidden_states,
attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
)
@add_start_docstrings(
"The CLVP decoder model with a language modelling head on top.",
CLVP_START_DOCSTRING,
)
class ClvpForCausalLM(ClvpPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.model = ClvpModel(self.config)
self.final_norm = nn.LayerNorm(self.config.hidden_size)
self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=True)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.input_embeds_layer
def set_input_embeddings(self, new_embeddings):
self.model.decoder.input_embeds_layer = new_embeddings
def _prepare_model_inputs(
self,
inputs: Optional[torch.Tensor] = None,
bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]:
"""
This function extracts the model-specific `inputs` for generation.
"""
input_name = self.main_input_name
model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None}
inputs_kwarg = model_kwargs.pop(input_name, None)
if inputs_kwarg is not None and inputs is not None:
raise ValueError(
f"`inputs`: {inputs}` were passed alongside {input_name} which is not allowed."
f"Make sure to either pass {inputs} or {input_name}=..."
)
elif inputs_kwarg is not None:
inputs = inputs_kwarg
if input_name == "input_ids" and "inputs_embeds" in model_kwargs:
model_kwargs["input_ids"] = self._maybe_initialize_input_ids_for_generation(
inputs, bos_token_id, model_kwargs=model_kwargs
)
inputs, input_name = model_kwargs["inputs_embeds"], "inputs_embeds"
# Check if conditioning_embeds are provided or not, if yes then concatenate the bos_token_id at the end of the conditioning_embeds.
# Then we must subtract the positional_ids because during the forward pass it will be added anyways, so we must cancel them out here.
conditioning_embeds = model_kwargs.get("conditioning_embeds", None)
if conditioning_embeds is not None:
mel_start_token_embedding = self.model.decoder.input_embeds_layer(
torch.full(
(conditioning_embeds.shape[0], 1),
fill_value=self.config.bos_token_id,
device=conditioning_embeds.device,
)
)
mel_start_token_embedding += self.model.decoder.position_embeds_layer(
torch.full((conditioning_embeds.shape[0], 1), fill_value=0, device=conditioning_embeds.device)
)
conditioning_embeds = torch.concat([conditioning_embeds, mel_start_token_embedding], dim=1)
# subtract the positional_ids here
if hasattr(model_kwargs, "attention_mask"):
position_ids = model_kwargs["attention_mask"].long().cumsum(-1) - 1
else:
position_ids = torch.range(
0, conditioning_embeds.shape[1] - 1, dtype=torch.long, device=conditioning_embeds.device
)
position_ids = position_ids.unsqueeze(0).repeat(conditioning_embeds.shape[0], 1)
model_kwargs["inputs_embeds"] = conditioning_embeds - self.model.decoder.position_embeds_layer(
position_ids
)
model_kwargs["input_ids"] = (
torch.ones((model_kwargs["inputs_embeds"].shape[0], 1), dtype=torch.long, device=self.device)
* self.config.bos_token_id
)
return model_kwargs["inputs_embeds"], "inputs_embeds", model_kwargs
inputs = self._maybe_initialize_input_ids_for_generation(inputs, bos_token_id, model_kwargs)
return inputs, input_name, model_kwargs
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, inputs_embeds=None, conditioning_embeds=None, **kwargs
):
input_ids_length = input_ids.shape[-1]
token_type_ids = kwargs.get("token_type_ids", None)
# only last token for inputs_ids if past is defined in kwargs
if past_key_values:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to old behavior: keep only final ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
if token_type_ids is not None:
token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)
else:
position_ids = None
if conditioning_embeds is not None and past_key_values is not None:
position_ids = torch.tensor([input_ids_length], dtype=torch.long, device=input_ids.device)
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"position_ids": position_ids,
"token_type_ids": token_type_ids,
}
)
return model_inputs
@add_start_docstrings_to_model_forward(CLVP_DECODER_INPUTS_DOCSTRING)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.model(
input_ids=input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
lm_logits = self.final_norm(hidden_states)
lm_logits = self.lm_head(lm_logits)
loss = None
if labels is not None:
labels = labels.to(lm_logits.device)
# Shift so that tokens < n predict n
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return CausalLMOutputWithCrossAttentions(
loss=loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
@staticmethod
# Copied from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache
def _reorder_cache(
past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
) -> Tuple[Tuple[torch.Tensor]]:
"""
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step.
"""
return tuple(
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
for layer_past in past_key_values
)
@add_start_docstrings(
"The composite CLVP model with a text encoder, speech encoder and speech decoder model."
"The speech decoder model generates the speech_ids from the text and the text encoder and speech encoder works"
"together to filter out the best speech_ids.",
CLVP_START_DOCSTRING,
)
class ClvpModelForConditionalGeneration(ClvpPreTrainedModel):
config_class = ClvpConfig
def __init__(self, config: ClvpConfig):
super().__init__(config)
if not isinstance(config.text_config, ClvpEncoderConfig):
raise ValueError(
"config.text_config is expected to be of type `ClvpEncoderConfig` but is of type"
f" {type(config.text_config)}."
)
if not isinstance(config.speech_config, ClvpEncoderConfig):
raise ValueError(
"config.speech_config is expected to be of type `ClvpEncoderConfig` but is of type"
f" {type(config.speech_config)}."
)
if not isinstance(config.decoder_config, ClvpDecoderConfig):
raise ValueError(
"config.decoder_config is expected to be of type `ClvpDecoderConfig` but is of type"
f" {type(config.decoder_config)}."
)
self.conditioning_encoder = ClvpConditioningEncoder(config)
self.speech_decoder_model = ClvpForCausalLM(config.decoder_config)
self.text_encoder_model = ClvpEncoder(config.text_config)
self.speech_encoder_model = ClvpEncoder(config.speech_config)
self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
# Initialize weights and apply final processing
self.post_init()
# taken from the original repo,
# link : https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/api.py#L117
def fix_speech_decoder_output(self, speech_ids: torch.LongTensor) -> torch.LongTensor:
"""
This method modifies the output of the decoder model, such as replacing the `eos_token_id` and changing the
last few tokens of each sequence.
Args:
speech_ids (`torch.LongTensor`):
This refers to the output of the decoder model.
"""
decoder_fixing_codes = self.config.decoder_config.decoder_fixing_codes
speech_ids = speech_ids[:, 1:]
if torch.isin(self.speech_decoder_model.config.eos_token_id, speech_ids):
speech_ids = torch.nn.functional.pad(
speech_ids, pad=(0, 1), value=self.speech_decoder_model.config.eos_token_id
)
stop_token_indices = torch.where(speech_ids == self.speech_decoder_model.config.eos_token_id, 1, 0)
speech_ids = torch.masked_fill(speech_ids, mask=stop_token_indices.bool(), value=decoder_fixing_codes[0])
for i, each_seq_stop_token_index in enumerate(stop_token_indices):
# This means that no stop tokens were found so the sentence was still being generated, in that case we don't need
# to apply any padding so just skip to the next sequence of tokens.
if each_seq_stop_token_index.sum() == 0:
continue
stm = each_seq_stop_token_index.argmax()
speech_ids[i, stm:] = decoder_fixing_codes[0]
if stm - 3 < speech_ids.shape[1]:
speech_ids[i, -3:] = torch.tensor(
[decoder_fixing_codes[1:]], device=speech_ids.device, dtype=torch.long
)
return speech_ids
def get_text_features(
self,
input_ids: Optional[torch.LongTensor] = None,
text_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
) -> torch.FloatTensor:
r"""
This method can be used to extract text_embeds from a text. The text embeddings obtained by applying the
projection layer to the pooled output of the CLVP text encoder model.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
[What are input IDs?](../glossary#input-ids)
text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
inputs_embeds for the text encoder model passed in place of `input_ids`.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Returns:
`torch.FloatTensor` of shape `(batch_size, output_dim)`:
The text embeddings obtained by applying the projection layer to the pooled output of the CLVP Text
Model.
Examples:
```python
>>> from transformers import ClvpProcessor, ClvpModelForConditionalGeneration
>>> # Define the Text
>>> text = "This is an example text."
>>> # Define processor and model
>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
>>> # Generate processor output and text embeds
>>> processor_output = processor(text=text, return_tensors="pt")
>>> text_embeds = model.get_text_features(input_ids=processor_output["input_ids"])
```
"""
outputs = self.text_encoder_model(
input_ids=input_ids,
inputs_embeds=text_encoder_inputs_embeds,
attention_mask=attention_mask,
)
return outputs[0]
def get_speech_features(
self,
speech_ids: Optional[torch.LongTensor] = None,
input_ids: Optional[torch.LongTensor] = None,
input_features: Optional[torch.FloatTensor] = None,
conditioning_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
**kwargs,
) -> torch.FloatTensor:
r"""
This method can be used to extract speech_embeds. The speech embeddings are obtained by applying the speech
model on speech_ids. If speech_ids is not present but both input_ids and input_features are given then the
decoder model will be used to first generate the speech_ids and then applying the speech model.
Args:
speech_ids (`torch.LongTensor` of shape `(batch_size, num_speech_ids)`, *optional*):
Speech Tokens. Padding will be ignored by default should you provide it. If speech_ids are provided
then input_ids and input_features will be automatically ignored.
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Input text Tokens. Processed from the [`ClvpTokenizer`]. If speech_ids is not provided, then input_ids
and input_features will be used.
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, time_dim)`, *optional*):
Indicates log-melspectrogram representations for audio returned by [`ClvpFeatureExtractor`]. If
speech_ids is not provided, then input_ids and input_features will be used.
conditioning_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
inputs_embeds for `ClvpConditioningEncoder`. Can be used in place of `input_ids`.
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding speech token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
generation_config (`GenerationConfig`, *optional*):
generation config to control the generation of speech_ids if they are not provided.
Returns:
`torch.FloatTensor` of shape `(batch_size, output_dim)`:
The speech embeddings obtained by applying the projection layer to the pooled output of the CLVP Speech
Model.
Examples:
```python
>>> import datasets
>>> from transformers import ClvpProcessor, ClvpModelForConditionalGeneration
>>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
>>> text = "This is an example text."
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
>>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
>>> # Define processor and model
>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
>>> # Generate processor output and model output
>>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
>>> speech_embeds = model.get_speech_features(
... input_ids=processor_output["input_ids"], input_features=processor_output["input_features"]
... )
```
"""
if speech_ids is None:
if (input_ids is None and conditioning_encoder_inputs_embeds is None) or input_features is None:
raise ValueError(
"Either speech_ids or input_ids/conditioning_encoder_inputs_embeds and input_features must be provided."
)
if generation_config is None:
generation_config = self.generation_config
generation_config.update(**kwargs)
conditioning_embeds = self.conditioning_encoder(
input_features=input_features,
input_ids=input_ids,
inputs_embeds=conditioning_encoder_inputs_embeds,
attention_mask=attention_mask,
)
speech_ids = self.speech_decoder_model.generate(
conditioning_embeds=conditioning_embeds,
generation_config=generation_config,
)
speech_ids = self.fix_speech_decoder_output(speech_ids[0])
outputs = self.speech_encoder_model(
input_ids=speech_ids,
attention_mask=attention_mask,
)
return outputs[0]
@add_start_docstrings_to_model_forward(CLVP_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ClvpOutput, config_class=ClvpConfig)
def forward(
self,
input_ids: torch.LongTensor = None,
input_features: torch.FloatTensor = None,
conditioning_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
text_encoder_inputs_embeds: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.LongTensor] = None,
return_loss: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, ClvpOutput]:
r"""
Returns:
Examples:
```python
>>> import datasets
>>> from transformers import ClvpProcessor, ClvpModelForConditionalGeneration
>>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library)
>>> text = "This is an example text."
>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
>>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
>>> # Define processor and model
>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
>>> # processor outputs and model outputs
>>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt")
>>> outputs = model(
... input_ids=processor_output["input_ids"],
... input_features=processor_output["input_features"],
... return_dict=True,
... )
```
"""
# Use CLVP model's config for some fields (if specified) instead of those of speech & text components.
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
conditioning_embeds = self.conditioning_encoder(
input_features=input_features,
input_ids=input_ids,
inputs_embeds=conditioning_encoder_inputs_embeds,
attention_mask=attention_mask,
)
decoder_outputs = self.speech_decoder_model(
inputs_embeds=conditioning_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
speech_ids = decoder_outputs[0]
# since we will get the embeds of shape `(batch_size, seq_len, embedding_dim)` during the forward pass
# we must convert it to tokens, to make it compaitable with speech_transformer
if speech_ids.ndim == 3:
speech_ids = speech_ids.argmax(2)
speech_ids = self.fix_speech_decoder_output(speech_ids)
speech_outputs = self.speech_encoder_model(
input_ids=speech_ids,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
text_outputs = self.text_encoder_model(
input_ids=input_ids,
inputs_embeds=text_encoder_inputs_embeds,
attention_mask=attention_mask,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
speech_embeds = speech_outputs[0]
text_embeds = text_outputs[0]
# normalized features
speech_embeds = speech_embeds / speech_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_text = torch.matmul(text_embeds, speech_embeds.t()) * logit_scale
logits_per_speech = logits_per_text.t()
loss = None
if return_loss:
loss = clvp_loss(logits_per_text)
if not return_dict:
output = (
logits_per_speech,
logits_per_text,
text_embeds,
speech_embeds,
text_outputs[2],
speech_outputs[2],
)
if output_hidden_states:
output += (
decoder_outputs[-1],
text_outputs[-1],
speech_outputs[-1],
)
return ((loss,) + output) if loss is not None else output
return ClvpOutput(
loss=loss,
logits_per_speech=logits_per_speech,
logits_per_text=logits_per_text,
text_embeds=text_embeds,
speech_embeds=speech_embeds,
text_model_output=text_outputs[2],
speech_model_output=speech_outputs[2],
decoder_hidden_states=decoder_outputs.hidden_states,
text_encoder_hidden_states=text_outputs.hidden_states,
speech_encoder_hidden_states=speech_outputs.hidden_states,
)
@torch.no_grad()
def generate(
self,
input_ids: torch.LongTensor = None,
input_features: torch.FloatTensor = None,
attention_mask: Optional[torch.LongTensor] = None,
generation_config: Optional[GenerationConfig] = None,
output_hidden_states: Optional[bool] = None,
**kwargs,
):
"""
Generate method for `ClvpModelForConditionalGeneration`, this method calls the `generate` method of
`ClvpForCausalLM` and then uses those generated `speech_ids` to process `text_embeds` and `speech_embeds` using
`ClvpEncoder`.
Args:
input_ids (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Input text Tokens. Processed from the [`ClvpTokenizer`].
input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, time_dim)`, *optional*):
Indicates log-melspectrogram representations for audio returned by [`ClvpFeatureExtractor`].
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding text token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
generation_config (`~generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, the default will be used, which had the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of decoder model, text encoder and speech encoder models.
Returns:
`ClvpOutput` or tuple: A `ClvpOutput` (if `return_dict_in_generate=True` or when
`config.return_dict_in_generate=True`) or a tuple.
"""
if generation_config is None:
generation_config = self.generation_config
generation_config = copy.deepcopy(generation_config)
model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs
generation_config.validate()
self._validate_model_kwargs(model_kwargs.copy())
conditioning_embeds = self.conditioning_encoder(
input_features=input_features,
input_ids=input_ids,
attention_mask=attention_mask,
)
decoder_outputs = self.speech_decoder_model.generate(
conditioning_embeds=conditioning_embeds,
generation_config=generation_config,
output_hidden_states=output_hidden_states,
return_dict=generation_config.return_dict_in_generate,
)
if isinstance(decoder_outputs, ModelOutput):
speech_ids = decoder_outputs.sequences
speech_ids = self.fix_speech_decoder_output(speech_ids)
speech_outputs = self.speech_encoder_model(
input_ids=speech_ids,
output_hidden_states=output_hidden_states,
return_dict=generation_config.return_dict_in_generate,
)
text_outputs = self.text_encoder_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=output_hidden_states,
return_dict=generation_config.return_dict_in_generate,
)
speech_embeds = speech_outputs[0]
text_embeds = text_outputs[0]
# normalized features
speech_embeds = speech_embeds / speech_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_text = torch.matmul(text_embeds, speech_embeds.t()) * logit_scale
logits_per_speech = logits_per_text.t()
if not generation_config.return_dict_in_generate:
output = (
speech_ids,
logits_per_speech,
logits_per_text,
text_embeds,
speech_embeds,
text_outputs[2],
speech_outputs[2],
)
if output_hidden_states:
output += (
decoder_outputs[-1],
text_outputs[-1],
speech_outputs[-1],
)
return output
return ClvpOutput(
speech_ids=speech_ids,
logits_per_speech=logits_per_speech,
logits_per_text=logits_per_text,
text_embeds=text_embeds,
speech_embeds=speech_embeds,
text_model_output=text_outputs[2],
speech_model_output=speech_outputs[2],
decoder_hidden_states=decoder_outputs.hidden_states,
text_encoder_hidden_states=text_outputs.hidden_states,
speech_encoder_hidden_states=speech_outputs.hidden_states,
)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""English Normalizer class for CLVP."""
import re
class EnglishNormalizer:
def __init__(self):
# List of (regular expression, replacement) pairs for abbreviations:
self._abbreviations = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
self.ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
self.teens = [
"ten",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
]
self.tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
def number_to_words(self, num: int) -> str:
"""
Converts numbers(`int`) to words(`str`).
Please note that it only supports upto - "'nine hundred ninety-nine quadrillion, nine hundred ninety-nine
trillion, nine hundred ninety-nine billion, nine hundred ninety-nine million, nine hundred ninety-nine
thousand, nine hundred ninety-nine'" or `number_to_words(999_999_999_999_999_999)`.
"""
if num == 0:
return "zero"
elif num < 0:
return "minus " + self.number_to_words(abs(num))
elif num < 10:
return self.ones[num]
elif num < 20:
return self.teens[num - 10]
elif num < 100:
return self.tens[num // 10] + ("-" + self.number_to_words(num % 10) if num % 10 != 0 else "")
elif num < 1000:
return (
self.ones[num // 100] + " hundred" + (" " + self.number_to_words(num % 100) if num % 100 != 0 else "")
)
elif num < 1_000_000:
return (
self.number_to_words(num // 1000)
+ " thousand"
+ (", " + self.number_to_words(num % 1000) if num % 1000 != 0 else "")
)
elif num < 1_000_000_000:
return (
self.number_to_words(num // 1_000_000)
+ " million"
+ (", " + self.number_to_words(num % 1_000_000) if num % 1_000_000 != 0 else "")
)
elif num < 1_000_000_000_000:
return (
self.number_to_words(num // 1_000_000_000)
+ " billion"
+ (", " + self.number_to_words(num % 1_000_000_000) if num % 1_000_000_000 != 0 else "")
)
elif num < 1_000_000_000_000_000:
return (
self.number_to_words(num // 1_000_000_000_000)
+ " trillion"
+ (", " + self.number_to_words(num % 1_000_000_000_000) if num % 1_000_000_000_000 != 0 else "")
)
elif num < 1_000_000_000_000_000_000:
return (
self.number_to_words(num // 1_000_000_000_000_000)
+ " quadrillion"
+ (
", " + self.number_to_words(num % 1_000_000_000_000_000)
if num % 1_000_000_000_000_000 != 0
else ""
)
)
else:
return "number out of range"
def convert_to_ascii(self, text: str) -> str:
"""
Converts unicode to ascii
"""
return text.encode("ascii", "ignore").decode("utf-8")
def _expand_dollars(self, m: str) -> str:
"""
This method is used to expand numerical dollar values into spoken words.
"""
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " dollars" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return "zero dollars"
def _remove_commas(self, m: str) -> str:
"""
This method is used to remove commas from sentences.
"""
return m.group(1).replace(",", "")
def _expand_decimal_point(self, m: str) -> str:
"""
This method is used to expand '.' into spoken word ' point '.
"""
return m.group(1).replace(".", " point ")
def _expand_ordinal(self, num: str) -> str:
"""
This method is used to expand ordinals such as '1st', '2nd' into spoken words.
"""
ordinal_suffixes = {1: "st", 2: "nd", 3: "rd"}
num = int(num.group(0)[:-2])
if 10 <= num % 100 and num % 100 <= 20:
suffix = "th"
else:
suffix = ordinal_suffixes.get(num % 10, "th")
return self.number_to_words(num) + suffix
def _expand_number(self, m: str) -> str:
"""
This method acts as a preprocessing step for numbers between 1000 and 3000 (same as the original repository,
link :
https://github.com/neonbjb/tortoise-tts/blob/4003544b6ff4b68c09856e04d3eff9da26d023c2/tortoise/utils/tokenizer.py#L86)
"""
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return "two thousand"
elif num > 2000 and num < 2010:
return "two thousand " + self.number_to_words(num % 100)
elif num % 100 == 0:
return self.number_to_words(num // 100) + " hundred"
else:
return self.number_to_words(num)
else:
return self.number_to_words(num)
def normalize_numbers(self, text: str) -> str:
"""
This method is used to normalize numbers within a text such as converting the numbers to words, removing
commas, etc.
"""
text = re.sub(re.compile(r"([0-9][0-9\,]+[0-9])"), self._remove_commas, text)
text = re.sub(re.compile(r"£([0-9\,]*[0-9]+)"), r"\1 pounds", text)
text = re.sub(re.compile(r"\$([0-9\.\,]*[0-9]+)"), self._expand_dollars, text)
text = re.sub(re.compile(r"([0-9]+\.[0-9]+)"), self._expand_decimal_point, text)
text = re.sub(re.compile(r"[0-9]+(st|nd|rd|th)"), self._expand_ordinal, text)
text = re.sub(re.compile(r"[0-9]+"), self._expand_number, text)
return text
def expand_abbreviations(self, text: str) -> str:
"""
Expands the abbreviate words.
"""
for regex, replacement in self._abbreviations:
text = re.sub(regex, replacement, text)
return text
def collapse_whitespace(self, text: str) -> str:
"""
Removes multiple whitespaces
"""
return re.sub(re.compile(r"\s+"), " ", text)
def __call__(self, text):
"""
Converts text to ascii, numbers / number-like quantities to their spelt-out counterparts and expands
abbreviations
"""
text = self.convert_to_ascii(text)
text = text.lower()
text = self.normalize_numbers(text)
text = self.expand_abbreviations(text)
text = self.collapse_whitespace(text)
text = text.replace('"', "")
return text
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for CLVP
"""
from ...processing_utils import ProcessorMixin
class ClvpProcessor(ProcessorMixin):
r"""
Constructs a CLVP processor which wraps a CLVP Feature Extractor and a CLVP Tokenizer into a single processor.
[`ClvpProcessor`] offers all the functionalities of [`ClvpFeatureExtractor`] and [`ClvpTokenizer`]. See the
[`~ClvpProcessor.__call__`], [`~ClvpProcessor.decode`] and [`~ClvpProcessor.batch_decode`] for more information.
Args:
feature_extractor (`ClvpFeatureExtractor`):
An instance of [`ClvpFeatureExtractor`]. The feature extractor is a required input.
tokenizer (`ClvpTokenizer`):
An instance of [`ClvpTokenizer`]. The tokenizer is a required input.
"""
feature_extractor_class = "ClvpFeatureExtractor"
tokenizer_class = "ClvpTokenizer"
model_input_names = [
"input_ids",
"input_features",
"attention_mask",
]
def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)
def __call__(self, *args, **kwargs):
"""
Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text`
argument to [`~ClvpTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
information.
"""
raw_speech = kwargs.pop("raw_speech", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None)
if raw_speech is None and text is None:
raise ValueError("You need to specify either an `raw_speech` or `text` input to process.")
if raw_speech is not None:
inputs = self.feature_extractor(raw_speech, sampling_rate=sampling_rate, **kwargs)
if text is not None:
encodings = self.tokenizer(text, **kwargs)
if text is None:
return inputs
elif raw_speech is None:
return encodings
else:
inputs["input_ids"] = encodings["input_ids"]
inputs["attention_mask"] = encodings["attention_mask"]
return inputs
# Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.batch_decode with Whisper->Clvp
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.decode with Whisper->Clvp
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for CLVP."""
import json
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
from .number_normalizer import EnglishNormalizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"clvp_dev": "https://huggingface.co/susnato/clvp_dev/blob/main/vocab.json",
},
"merges_file": {
"clvp_dev": "https://huggingface.co/susnato/clvp_dev/blob/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"clvp_dev": 1024,
}
@lru_cache()
# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class ClvpTokenizer(PreTrainedTokenizer):
"""
Construct a CLVP tokenizer. Based on byte-level Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
```python
>>> from transformers import ClvpTokenizer
>>> tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")
>>> tokenizer("Hello world")["input_ids"]
[62, 84, 28, 2, 179, 79]
>>> tokenizer(" Hello world")["input_ids"]
[2, 62, 84, 28, 2, 179, 79]
```
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
<Tip>
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
</Tip>
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"[STOP]"`):
The end of sequence token.
pad_token (`str`, *optional*, defaults to `"[STOP]"`):
The pad token of the sequence.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (CLVP tokenizer detect beginning of words by the preceding space).
add_bos_token (`bool`, *optional*, defaults to `False`):
Whether to add `bos_token` in front of the sequence when add_special_tokens=True.
add_eos_token (`bool`, *optional*, defaults to `False`):
Whether to add `eos_token` in end of the sequence when add_special_tokens=True.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = [
"input_ids",
"attention_mask",
]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="[UNK]",
bos_token="<|endoftext|>",
eos_token="[STOP]",
pad_token="[STOP]",
add_prefix_space=False,
add_bos_token=False,
add_eos_token=False,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self._normalizer = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
super().__init__(
errors=errors,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
**kwargs,
)
@property
def vocab_size(self):
return len(self.encoder)
@property
def normalizer(self):
if self._normalizer is None:
self._normalizer = EnglishNormalizer()
return self._normalizer
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_special_tokens_mask
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
if not self.add_bos_token:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
)
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0))
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
text = self.normalizer(text)
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
# if the token is "Ġ" we replace it with "[SPACE]" (if "[SPACE]" is present in the vocab), otherwise we keep the "Ġ".
bpe_tokens.extend(
"[SPACE]" if bpe_token == "\u0120" and "[SPACE]" in self.encoder.keys() else bpe_token
for bpe_token in self.bpe(token).split(" ")
)
return bpe_tokens
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def clean_up_tokenization(self, text):
text = "".join(text)
vocab_tokens = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
text = text.replace("[SPACE]", " ") if "[SPACE]" in vocab_tokens else text
text = text.replace("[STOP]", " ") if "[STOP]" in vocab_tokens else text
text = text.replace(self.unk_token, "").replace(" ", " ").replace(" ", " ")
return text
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
......@@ -1940,6 +1940,51 @@ class CLIPSegVisionModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
CLVP_PRETRAINED_MODEL_ARCHIVE_LIST = None
class ClvpDecoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ClvpEncoder(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ClvpForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ClvpModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ClvpModelForConditionalGeneration(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ClvpPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import itertools
import os
import random
import tempfile
import unittest
import numpy as np
from datasets import Audio, load_dataset
from transformers import ClvpFeatureExtractor
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, slow
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
if is_torch_available():
import torch
global_rng = random.Random()
# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
@require_torch
class ClvpFeatureExtractionTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
min_seq_length=400,
max_seq_length=2000,
feature_size=10,
hop_length=160,
chunk_length=8,
padding_value=0.0,
sampling_rate=4_000,
return_attention_mask=False,
):
self.parent = parent
self.batch_size = batch_size
self.min_seq_length = min_seq_length
self.max_seq_length = max_seq_length
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
self.padding_value = padding_value
self.sampling_rate = sampling_rate
self.return_attention_mask = return_attention_mask
self.feature_size = feature_size
self.chunk_length = chunk_length
self.hop_length = hop_length
def prepare_feat_extract_dict(self):
return {
"feature_size": self.feature_size,
"hop_length": self.hop_length,
"chunk_length": self.chunk_length,
"padding_value": self.padding_value,
"sampling_rate": self.sampling_rate,
"return_attention_mask": self.return_attention_mask,
}
# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
def _flatten(list_of_lists):
return list(itertools.chain(*list_of_lists))
if equal_length:
speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
else:
# make sure that inputs increase in size
speech_inputs = [
floats_list((x, self.feature_size))
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
]
if numpify:
speech_inputs = [np.asarray(x) for x in speech_inputs]
return speech_inputs
@require_torch
class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = ClvpFeatureExtractor
def setUp(self):
self.feat_extract_tester = ClvpFeatureExtractionTester(self)
def tearDown(self):
super().tearDown()
# clean-up as much as possible GPU memory occupied by PyTorch
gc.collect()
torch.cuda.empty_cache()
# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_to_json_file
def test_feat_extract_to_json_file(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
feat_extract_first.to_json_file(json_file_path)
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.mel_filters
mel_2 = feat_extract_second.mel_filters
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# Test feature size
input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
self.assertTrue(input_features.ndim == 3)
self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size)
# Test not batched input
encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
# Test batched
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
# Test 2-D numpy arrays are batched.
speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
np_speech_inputs = np.asarray(speech_inputs)
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
# Test truncation required
speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs]
np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad
def test_double_precision_pad(self):
import torch
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
py_speech_inputs = np_speech_inputs.tolist()
for inputs in [py_speech_inputs, np_speech_inputs]:
np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
self.assertTrue(np_processed.input_features.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=22050))
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
@slow
def test_integration(self):
# fmt: off
EXPECTED_INPUT_FEATURES = torch.tensor(
[
0.9271, 1.1405, 1.4419, 1.2470, 1.2438, 1.1787, 1.0595, 1.0570, 1.1070,
1.2205, 1.2376, 1.2997, 1.1131, 1.0843, 1.0459, 1.1858, 1.2323, 1.3582,
1.3401, 1.3770, 1.4173, 1.3381, 1.2291, 1.0854, 1.2116, 1.1873, 1.2178,
1.2137, 1.3001, 1.4274
]
)
# fmt: on
input_speech, sr = self._load_datasamples(1)
feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev")
input_features = feature_extractor(input_speech, sampling_rate=sr[0], return_tensors="pt").input_features
self.assertEqual(input_features.shape, (1, 80, 517))
self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4))
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Clvp model. """
import gc
import tempfile
import unittest
import datasets
import numpy as np
from transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig
from transformers.testing_utils import (
require_torch,
slow,
torch_device,
)
from transformers.utils import is_torch_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
_config_zero_init,
ids_tensor,
random_attention_mask,
)
if is_torch_available():
import torch
from transformers import ClvpEncoder, ClvpForCausalLM, ClvpModel, ClvpModelForConditionalGeneration
from transformers.models.clvp.modeling_clvp import CLVP_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers import ClvpFeatureExtractor, ClvpTokenizer
class ClvpEncoderTester:
def __init__(
self,
parent,
batch_size=2,
seq_length=7,
is_training=False,
use_input_mask=True,
use_labels=True,
vocab_size=50,
hidden_size=128,
projection_dim=16,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=32,
dropout=0.1,
attention_dropout=0.1,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.scope = scope
self.bos_token_id = vocab_size - 1
self.eos_token_id = vocab_size - 1
def get_config(self):
encoder_config = ClvpEncoderConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
initializer_range=self.initializer_range,
bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id,
)
return encoder_config
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
if input_mask is not None:
batch_size, seq_length = input_mask.shape
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
for batch_idx, start_index in enumerate(rnd_start_indices):
input_mask[batch_idx, :start_index] = 1
input_mask[batch_idx, start_index:] = 0
encoder_config = self.get_config()
return encoder_config, input_ids, input_mask
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
speech_config, input_ids, input_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids.to(torch_device), "attention_mask": input_mask.to(torch_device)}
return speech_config, inputs_dict
def create_and_check_model(self, speech_config, input_ids, input_mask):
text_config = ClvpEncoderConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
projection_dim=self.projection_dim,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
initializer_range=self.initializer_range,
)
text_encoder_model = ClvpEncoder(config=text_config)
text_encoder_model.to(torch_device)
text_encoder_model.eval()
with torch.no_grad():
result = text_encoder_model(input_ids, attention_mask=input_mask)
result = text_encoder_model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))
# now check with speech config
speech_encoder_model = ClvpEncoder(config=speech_config)
speech_encoder_model.to(torch_device)
speech_encoder_model.eval()
with torch.no_grad():
result = speech_encoder_model(input_ids, attention_mask=input_mask)
result = speech_encoder_model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim))
@require_torch
class ClvpEncoderTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (ClvpEncoder,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
test_torchscript = False
def setUp(self):
self.model_tester = ClvpEncoderTester(self)
self.encoder_config_tester = ConfigTester(self, config_class=ClvpEncoderConfig, hidden_size=32)
def tearDown(self):
super().tearDown()
# clean-up as much as possible GPU memory occupied by PyTorch
gc.collect()
torch.cuda.empty_cache()
def test_config(self):
self.encoder_config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="ClvpEncoder does not output loss")
def test_training(self):
pass
@unittest.skip(reason="ClvpEncoder does not output loss")
def test_training_gradient_checkpointing(self):
pass
class ClvpDecoderTester:
def __init__(
self,
parent,
batch_size=2,
seq_length=3,
is_training=False,
vocab_size=300,
max_position_embeddings=256,
max_text_tokens=256,
use_input_mask=True,
hidden_size=128,
num_hidden_layers=2,
num_attention_heads=2,
bos_token_id=97,
eos_token_id=98,
relative_attention_num_buckets=4,
relative_attention_max_distance=16,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.max_text_tokens = max_text_tokens
self.use_input_mask = use_input_mask
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.num_hidden_layers = num_hidden_layers
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.relative_attention_num_buckets = relative_attention_num_buckets
self.relative_attention_max_distance = relative_attention_max_distance
def get_config(self):
decoder_config = ClvpDecoderConfig(
vocab_size=self.vocab_size,
max_position_embeddings=self.max_position_embeddings,
max_text_tokens=self.max_text_tokens,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id,
relative_attention_num_buckets=self.relative_attention_num_buckets,
relative_attention_max_distance=self.relative_attention_max_distance,
)
return decoder_config
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
if input_mask is not None:
batch_size, seq_length = input_mask.shape
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
for batch_idx, start_index in enumerate(rnd_start_indices):
input_mask[batch_idx, :start_index] = 1
input_mask[batch_idx, start_index:] = 0
decoder_config = self.get_config()
return decoder_config, input_ids, input_mask
def create_and_check_model(self, config, input_ids, attention_mask):
model = ClvpForCausalLM(config).to(torch_device).eval()
with torch.no_grad():
result = model(input_ids=input_ids, attention_mask=attention_mask)
self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.vocab_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask = config_and_inputs
inputs_dict = {
"input_ids": input_ids.to(torch_device),
"attention_mask": attention_mask.to(torch_device),
}
return config, inputs_dict
@require_torch
class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (ClvpForCausalLM,) if is_torch_available() else ()
test_pruning = False
def setUp(self):
self.model_tester = ClvpDecoderTester(self)
self.decoder_config_tester = ConfigTester(self, config_class=ClvpDecoderConfig, hidden_size=32)
def tearDown(self):
super().tearDown()
# clean-up as much as possible GPU memory occupied by PyTorch
gc.collect()
torch.cuda.empty_cache()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
if return_labels and model_class == ClvpForCausalLM:
inputs_dict["labels"] = torch.zeros(
[self.model_tester.batch_size, self.model_tester.seq_length], device=torch_device
).long()
return inputs_dict
def test_training(self):
# we will only test the ClvpForCausalLM since it outputs loss
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
model = ClvpForCausalLM(config)
model.to(torch_device)
model.train()
inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True)
loss = model(**inputs).loss
loss.backward()
def test_training_gradient_checkpointing(self):
# we will only test the ClvpForCausalLM since it outputs loss
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.use_cache = False
config.return_dict = True
model = ClvpForCausalLM(config)
model.to(torch_device)
model.gradient_checkpointing_enable()
model.train()
inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True)
loss = model(**inputs).loss
loss.backward()
class ClvpModelForConditionalGenerationTester:
def __init__(self, parent, is_training=False):
self.parent = parent
self.clvp_encoder_tester = ClvpEncoderTester(parent)
self.is_training = is_training
def get_config(self):
decoder_config = ClvpDecoderConfig(
vocab_size=50,
max_position_embeddings=30,
max_text_tokens=30,
hidden_size=128,
num_hidden_layers=1,
num_attention_heads=2,
bos_token_id=97,
eos_token_id=98,
relative_attention_num_buckets=4,
relative_attention_max_distance=16,
)
text_config = self.clvp_encoder_tester.get_config()
speech_config = self.clvp_encoder_tester.get_config()
speech_config.vocab_size = 300
return ClvpConfig.from_sub_model_configs(
text_config,
speech_config,
decoder_config,
projection_dim=16,
)
def prepare_config_and_inputs(self):
_, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs()
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
feature_extractor = ClvpFeatureExtractor()
input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[
"input_features"
].to(torch_device)
config = self.get_config()
return config, input_ids, attention_mask, input_features
def create_and_check_model(self, config, input_ids, attention_mask, input_features):
model = ClvpModelForConditionalGeneration(config).to(torch_device).eval()
with torch.no_grad():
result = model(input_ids=input_ids, input_features=input_features, attention_mask=attention_mask)
self.parent.assertEqual(result.logits_per_speech.shape, (2, self.clvp_encoder_tester.batch_size))
self.parent.assertEqual(result.logits_per_text.shape, (self.clvp_encoder_tester.batch_size, 2))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask, input_features = config_and_inputs
inputs_dict = {
"input_ids": input_ids.to(torch_device),
"attention_mask": attention_mask.to(torch_device),
"input_features": input_features.to(torch_device),
"return_loss": False,
}
return config, inputs_dict
@require_torch
class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (ClvpModelForConditionalGeneration,) if is_torch_available() else ()
test_head_masking = False
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
test_torchscript = False
def setUp(self):
self.model_tester = ClvpModelForConditionalGenerationTester(self)
self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32)
def tearDown(self):
super().tearDown()
# clean-up as much as possible GPU memory occupied by PyTorch
gc.collect()
torch.cuda.empty_cache()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
# check for decoder model, text encoder model and speech encoder model hidden states
decoder_hidden_states = outputs.decoder_hidden_states
text_encoder_hidden_states = outputs.text_encoder_hidden_states
speech_encoder_hidden_states = outputs.speech_encoder_hidden_states
# check length of the hidden states
expected_decoder_num_layers = config.decoder_config.num_hidden_layers + 1
self.assertEqual(len(decoder_hidden_states), expected_decoder_num_layers)
expected_speech_encoder_num_layers = config.text_config.num_hidden_layers + 1
self.assertEqual(len(text_encoder_hidden_states), expected_speech_encoder_num_layers)
expected_text_encoder_num_layers = config.speech_config.num_hidden_layers + 1
self.assertEqual(len(speech_encoder_hidden_states), expected_text_encoder_num_layers)
# check shapes of each hidden state
# for the decoder model we will only test the dimension because the ClvpConditioningEncoder could increase
# the sequence lengths.
self.assertEqual(decoder_hidden_states[0].shape[-1], config.decoder_config.hidden_size)
# the testing for text encoder stays standard because we just pass the text tokens here.
self.assertListEqual(
list(text_encoder_hidden_states[0].shape[-2:]),
[self.model_tester.clvp_encoder_tester.seq_length, config.text_config.hidden_size],
)
# for the decoder model we will only test the dimension because the fix_decoder_outputs method could increase
# the sequence lengths by adding `decoder_fixing_codes` tokens at the end.
self.assertEqual(speech_encoder_hidden_states[0].shape[-1], config.speech_config.hidden_size)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
@unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings")
def test_model_common_attributes(self):
pass
# override as the `logit_scale` parameter initilization is different for Clvp
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
# check if `logit_scale` is initilized as per the original implementation
if name == "logit_scale":
expected_value = np.log(1 / 0.07)
returned_value = param.data.item()
self.assertAlmostEqual(
returned_value,
expected_value,
delta=1e-3,
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
else:
expected_range = [0.0, 1.0]
returned_range = ((param.data.mean() * 1e9).round() / 1e9).item()
self.assertIn(
returned_range,
expected_range,
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
def test_load_speech_text_decoder_config(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Save ClvpConfig and check if we can load ClvpEncoderConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
encoder_config = ClvpEncoderConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.text_config.to_dict(), encoder_config.to_dict())
# Save ClvpConfig and check if we can load ClvpDecoderConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
decoder_config = ClvpDecoderConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.decoder_config.to_dict(), decoder_config.to_dict())
@slow
def test_model_from_pretrained(self):
for model_name in CLVP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = ClvpModelForConditionalGeneration.from_pretrained(model_name)
self.assertIsNotNone(model)
# Since Clvp has a lot of different models connected with each other it's better to test each of them individually along
# with a test_full_model_integration. If the model breaks in future, it could be of a great help to identify the broken part.
@slow
@require_torch
class ClvpIntegrationTest(unittest.TestCase):
def setUp(self):
self.text = "This is an example text."
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
_, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values()
self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device)
self.model.eval()
tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")
feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev")
tokenizer_output = tokenizer(self.text, return_tensors="pt")
self.text_tokens = tokenizer_output["input_ids"].to(torch_device)
self.input_features = feature_extractor(
raw_speech=self.speech_samples, sampling_rate=self.sr, return_tensors="pt"
)["input_features"].to(torch_device)
def tearDown(self):
super().tearDown()
# clean-up as much as possible GPU memory occupied by PyTorch
gc.collect()
torch.cuda.empty_cache()
def test_conditional_encoder(self):
with torch.no_grad():
conditioning_encoder_outputs = self.model.conditioning_encoder(
input_features=self.input_features, input_ids=self.text_tokens
).to("cpu")
self.assertEqual(
conditioning_encoder_outputs.shape,
torch.Size((self.input_features.shape[0], 18, self.model.config.decoder_config.hidden_size)),
)
EXPECTED_OUTPUTS = torch.tensor(
[[-0.8582, 0.5228, 1.9944], [-0.0465, -1.1017, -0.0093], [-0.0466, -0.6030, -0.1280]]
)
self.assertTrue(torch.allclose(conditioning_encoder_outputs[0, :3, :3], EXPECTED_OUTPUTS, atol=1e-4))
def test_decoder_model_generate(self):
autoregressive_model_output = self.model.speech_decoder_model.generate(input_ids=self.text_tokens).cpu()
EXPECTED_OUTPUTS = torch.tensor([[147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 9, 8193]])
self.assertTrue(torch.allclose(autoregressive_model_output, EXPECTED_OUTPUTS))
def test_text_and_speech_encoder_models(self):
# check for text embeds
text_embeds = self.model.text_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()
# fmt: off
EXPECTED_TEXT_EMBEDS = torch.tensor(
[ 1.8060e+00, -2.7928e+00, 3.2021e+00, -1.5673e+00, 2.3284e+00, -3.2065e+00, -1.3368e+00, 2.2322e+00,
-1.7667e+00, 4.1505e-01, 2.4119e+00, -5.8133e-03, -4.6367e+00, 1.6450e-01, 6.7459e+00, 6.6292e+00,
1.1046e+00, 3.6196e+00, -1.0496e+01, 5.4924e+00
]
)
# fmt: on
self.assertTrue(torch.allclose(text_embeds[0, :20], EXPECTED_TEXT_EMBEDS, atol=1e-4))
# check for speech embeds
speech_embeds = self.model.speech_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()
# fmt: off
EXPECTED_SPEECH_EMBEDS = torch.tensor(
[ 4.6143, -5.5784, 0.8983, -3.9665, -0.6714, -1.0665, -1.1277, 1.5619, 2.6322, -7.2008, -2.4932, 0.3265,
-1.4738, 0.1425, 5.0825, 4.1760, -5.4708, 2.1935, -6.0044, 3.9540
]
)
# fmt: on
self.assertTrue(torch.allclose(speech_embeds[0, :20], EXPECTED_SPEECH_EMBEDS, atol=1e-4))
def test_full_model_integration(self):
full_model_output = self.model.generate(
input_ids=self.text_tokens,
input_features=self.input_features,
do_sample=False,
num_beams=4,
num_return_sequences=4,
max_new_tokens=10,
).speech_ids.cpu()
EXPECTED_OUTPUTS = torch.tensor([[1953, 1080, 612], [1953, 1953, 612], [1953, 612, 716]])
self.assertTrue(torch.allclose(full_model_output[-3:, -3:], EXPECTED_OUTPUTS))
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import shutil
import tempfile
import unittest
from transformers import ClvpFeatureExtractor, ClvpProcessor, ClvpTokenizer
from transformers.testing_utils import require_torch
from .test_feature_extraction_clvp import floats_list
@require_torch
class ClvpProcessorTest(unittest.TestCase):
def setUp(self):
self.checkpoint = "susnato/clvp_dev"
self.tmpdirname = tempfile.mkdtemp()
def tearDown(self):
super().tearDown()
shutil.rmtree(self.tmpdirname)
gc.collect()
# Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_tokenizer with Whisper->Clvp
def get_tokenizer(self, **kwargs):
return ClvpTokenizer.from_pretrained(self.checkpoint, **kwargs)
# Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_feature_extractor with Whisper->Clvp
def get_feature_extractor(self, **kwargs):
return ClvpFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
# Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_save_load_pretrained_default with Whisper->Clvp
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
feature_extractor = self.get_feature_extractor()
processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
processor.save_pretrained(self.tmpdirname)
processor = ClvpProcessor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, ClvpTokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor)
# Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_feature_extractor with Whisper->Clvp,processor(raw_speech->processor(raw_speech=raw_speech
def test_feature_extractor(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
raw_speech = floats_list((3, 1000))
input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
input_processor = processor(raw_speech=raw_speech, return_tensors="np")
for key in input_feat_extract.keys():
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
# Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer with Whisper->Clvp
def test_tokenizer(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
input_str = "This is a test string"
encoded_processor = processor(text=input_str)
encoded_tok = tokenizer(input_str)
for key in encoded_tok.keys():
self.assertListEqual(encoded_tok[key], encoded_processor[key])
# Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer_decode with Whisper->Clvp
def test_tokenizer_decode(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
decoded_processor = processor.batch_decode(predicted_ids)
decoded_tok = tokenizer.batch_decode(predicted_ids)
self.assertListEqual(decoded_tok, decoded_processor)
def test_save_load_pretrained_additional_features(self):
processor = ClvpProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(pad_token="(PAD)")
feature_extractor_add_kwargs = self.get_feature_extractor(sampling_rate=16000)
processor = ClvpProcessor.from_pretrained(
self.tmpdirname,
pad_token="(PAD)",
sampling_rate=16000,
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, ClvpTokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor)
def test_model_input_names(self):
feature_extractor = self.get_feature_extractor()
tokenizer = self.get_tokenizer()
processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
self.assertListEqual(
sorted(processor.model_input_names),
sorted(set(feature_extractor.model_input_names + tokenizer.model_input_names)),
msg="`processor` and `feature_extractor` model input names do not match",
)
# coding=utf-8
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import unittest
from typing import List
from transformers import ClvpTokenizer
from ...test_tokenization_common import TokenizerTesterMixin, slow
class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ClvpTokenizer
test_rust_tokenizer = False
from_pretrained_kwargs = {"add_prefix_space": True}
test_seq2seq = False
test_sentencepiece_ignore_case = True
def setUp(self):
super().setUp()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"\u0120",
"\u0120l",
"\u0120n",
"\u0120lo",
"\u0120low",
"er",
"\u0120lowest",
"\u0120newer",
"\u0120wider",
"<unk>",
"<|endoftext|>",
"[SPACE]",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, "vocab.json")
self.merges_file = os.path.join(self.tmpdirname, "merges.txt")
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return ClvpTokenizer.from_pretrained(self.tmpdirname, **kwargs)
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text
# Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
def test_add_special_tokens(self):
tokenizers: List[ClvpTokenizer] = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
special_token = "[SPECIAL_TOKEN]"
special_token_box = [1000, 1000, 1000, 1000]
tokenizer.add_special_tokens({"cls_token": special_token})
encoded_special_token = tokenizer.encode(
[special_token], boxes=[special_token_box], add_special_tokens=False
)
self.assertEqual(len(encoded_special_token), 1)
decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
self.assertTrue(special_token not in decoded)
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
sequence = "lower newer"
# Testing tokenization
tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
# Testing conversion to ids without special tokens
ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
# Testing conversion to ids with special tokens
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
ids = tokenizer.encode(sequence, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
# Testing the unknown token
input_tokens = tokens + [rust_tokenizer.unk_token]
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input 1", "This is a simple input 2"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
]
# Simple input tests
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
s2,
max_length=max_length,
padding="max_length",
)
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
p2,
max_length=max_length,
padding="max_length",
)
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding_if_pad_token_set_slow
def test_padding_if_pad_token_set_slow(self):
tokenizer = ClvpTokenizer.from_pretrained(self.tmpdirname, pad_token="<pad>")
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input looooooooong", "This is a simple input"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input loooooong", "This is a simple input"),
("This is a simple pair loooooong", "This is a simple pair"),
]
pad_token_id = tokenizer.pad_token_id
out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np")
out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np")
out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np")
out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np")
# s
# test single string max_length padding
self.assertEqual(out_s["input_ids"].shape[-1], 30)
self.assertTrue(pad_token_id in out_s["input_ids"])
self.assertTrue(0 in out_s["attention_mask"])
# s2
# test automatic padding
self.assertEqual(out_s2["input_ids"].shape[-1], 33)
# long slice doesn't have padding
self.assertFalse(pad_token_id in out_s2["input_ids"][0])
self.assertFalse(0 in out_s2["attention_mask"][0])
# short slice does have padding
self.assertTrue(pad_token_id in out_s2["input_ids"][1])
self.assertTrue(0 in out_s2["attention_mask"][1])
# p
# test single pair max_length padding
self.assertEqual(out_p["input_ids"].shape[-1], 60)
self.assertTrue(pad_token_id in out_p["input_ids"])
self.assertTrue(0 in out_p["attention_mask"])
# p2
# test automatic padding pair
self.assertEqual(out_p2["input_ids"].shape[-1], 52)
# long slice pair doesn't have padding
self.assertFalse(pad_token_id in out_p2["input_ids"][0])
self.assertFalse(0 in out_p2["attention_mask"][0])
# short slice pair does have padding
self.assertTrue(pad_token_id in out_p2["input_ids"][1])
self.assertTrue(0 in out_p2["attention_mask"][1])
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_special_tokens_mask_input_pairs_and_bos_token
def test_special_tokens_mask_input_pairs_and_bos_token(self):
# TODO: change to self.get_tokenizers() when the fast version is implemented
tokenizers = [self.get_tokenizer(do_lower_case=False, add_bos_token=True)]
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
sequence_0 = "Encode this."
sequence_1 = "This one too please."
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(
sequence_0,
sequence_1,
add_special_tokens=True,
return_special_tokens_mask=True,
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
filtered_sequence = [
(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
]
filtered_sequence = [x for x in filtered_sequence if x is not None]
self.assertEqual(encoded_sequence, filtered_sequence)
def test_token_type_ids(self):
tokenizer = self.get_tokenizer()
seq_0 = "Test this method."
# We want to have sequence 0 and sequence 1 are tagged
# respectively with 0 and 1 token_ids
# (regardless of whether the model use token type ids)
# We use this assumption in the QA pipeline among other place
output = tokenizer(seq_0, return_token_type_ids=True, add_special_tokens=True)
self.assertIn(0, output["token_type_ids"])
def test_full_tokenizer(self):
tokenizer = ClvpTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "lower newer"
bpe_tokens = ["l", "o", "w", "er", "[SPACE]", "n", "e", "w", "er"]
tokens = tokenizer.tokenize(text, add_prefix_space=False)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + [tokenizer.unk_token]
input_bpe_tokens = [0, 1, 2, 15, 21, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
@slow
def test_outputs_with_numbers(self):
text = "hello and this is an example text and I have $1000. my lucky number is 12345."
tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev")
# fmt: off
EXPECTED_OUTPUT = [62, 84, 28, 2, 53, 2,147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 2, 53, 2, 22,
2, 148, 2, 110, 2, 40, 206, 53, 2, 134, 84, 59, 32, 9, 2, 125, 2, 25, 34, 197, 38, 2, 27,
231, 15, 44, 2, 54, 2, 33, 100, 25, 76, 2, 40, 206, 53, 7, 2, 40, 46, 18, 2, 21, 97, 17,
219, 2, 87, 210, 8, 19, 22, 76, 9,
]
# fmt: on
self.assertListEqual(tokenizer.encode(text, add_special_tokens=False), EXPECTED_OUTPUT)
@slow
def test_tokenizer_integration(self):
sequences = [
"Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
"general-purpose architectures (BERT, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
"Language Understanding (NLU) and Natural Language Generation (NLG) with over multiple pretrained "
"models and deep interoperability between Jax, PyTorch and TensorFlow.",
"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
"conditioning on both left and right context in all layers.",
"The quick brown fox jumps over the lazy dog.",
]
# fmt: off
expected_encoding = {'input_ids': [[144, 43, 32, 87, 26, 173, 2, 5, 87, 26, 44, 70, 2, 209, 27, 2, 55, 2, 29, 38, 51, 31, 71, 8, 144, 43, 32, 87, 26, 173, 2, 53, 2, 29, 38, 51, 31, 71, 8, 29, 46, 144, 137, 49, 8, 15, 44, 33, 6, 2, 187, 35, 83, 61, 2, 20, 50, 44, 56, 8, 29, 121, 139, 66, 2, 59, 71, 60, 18, 16, 33, 34, 175, 2, 5, 15, 44, 33, 7, 2, 89, 15, 44, 33, 14, 7, 2, 37, 25, 26, 7, 2, 17, 54, 78, 25, 15, 44, 33, 7, 2, 37, 25, 111, 33, 9, 9, 9, 6, 2, 87, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 97, 234, 63, 53, 52, 2, 5, 27, 25, 34, 6, 2, 53, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 20, 50, 44, 158, 2, 5, 27, 25, 20, 6, 2, 103, 2, 253, 2, 26, 167, 78, 29, 64, 2, 29, 46, 144, 137, 49, 2, 115, 126, 25, 32, 2, 53, 2, 126, 18, 29, 2, 41, 114, 161, 44, 109, 151, 240, 2, 67, 33, 100, 50, 2, 23, 14, 37, 7, 2, 29, 38, 51, 31, 71, 2, 53, 2, 33, 50, 32, 57, 19, 25, 69, 9], [ 15, 44, 33, 2, 54, 2, 17, 61, 22, 20, 27, 49, 2, 51, 2, 29, 46, 8, 144, 137, 2, 126, 18, 29, 2, 15, 83, 22, 46, 16, 181, 56, 2, 46, 29, 175, 86, 158, 32, 2, 154, 2, 97, 25, 14, 67, 25, 49, 2, 136, 37, 33, 2, 185, 2, 23, 28, 41, 33, 70, 2, 135, 17, 60, 107, 52, 2, 47, 2, 165, 40, 2, 64, 19, 33, 2, 53, 2, 101, 104, 2, 135, 136, 37, 33, 2, 41, 2, 108, 2, 25, 88, 173, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 42, 2, 194, 91, 24, 2, 243, 190, 2, 182, 37, 2, 23, 231, 29, 32, 2, 253, 2, 42, 2, 25, 14, 39, 38, 2, 134, 20, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501
}
# fmt: on
self.tokenizer_integration_test_util(
sequences=sequences, expected_encoding=expected_encoding, model_name="susnato/clvp_dev", padding=True
)
......@@ -207,6 +207,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"CLIPTextModelWithProjection",
"CLIPVisionModel",
"CLIPVisionModelWithProjection",
"ClvpForCausalLM",
"ClvpModel",
"GroupViTTextModel",
"GroupViTVisionModel",
"TFCLIPTextModel",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment