Unverified Commit d57ffb48, authored by Joao Gante, committed by GitHub

Generate: remove deprecated public decoding functions and streamline logic 🧼 (#29956)

parent dc401d3a
@@ -123,7 +123,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
                inputs_tensor, assistant_model.generation_config.bos_token_id, assistant_kwargs
            )
            assistant_kwargs = assistant_model._prepare_encoder_decoder_kwargs_for_generation(
-               inputs_tensor, assistant_kwargs, model_input_name
+               inputs_tensor, assistant_kwargs, model_input_name, assistant_model.generation_config
            )
        elif "encoder_outputs" in model_kwargs:
            assistant_kwargs["encoder_outputs"] = model_kwargs["encoder_outputs"]
@@ -65,25 +65,16 @@ class GenerationConfig(PushToHubMixin):
    Class that holds a configuration for a generation task. A `generate` call supports the following generation methods
    for text-decoder, text-to-text, speech-to-text, and vision-to-text models:

-        - *greedy decoding* by calling [`~generation.GenerationMixin._greedy_search`] if `num_beams=1` and
-          `do_sample=False`
-        - *contrastive search* by calling [`~generation.GenerationMixin._contrastive_search`] if `penalty_alpha>0.`
-          and `top_k>1`
-        - *multinomial sampling* by calling [`~generation.GenerationMixin._sample`] if `num_beams=1` and
-          `do_sample=True`
-        - *beam-search decoding* by calling [`~generation.GenerationMixin._beam_search`] if `num_beams>1` and
-          `do_sample=False`
-        - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin._beam_sample`] if
-          `num_beams>1` and `do_sample=True`
-        - *diverse beam-search decoding* by calling [`~generation.GenerationMixin._group_beam_search`], if
-          `num_beams>1` and `num_beam_groups>1`
-        - *constrained beam-search decoding* by calling [`~generation.GenerationMixin._constrained_beam_search`], if
-          `constraints!=None` or `force_words_ids!=None`
-        - *assisted decoding* by calling [`~generation.GenerationMixin._assisted_decoding`], if
-          `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`
+        - *greedy decoding* if `num_beams=1` and `do_sample=False`
+        - *contrastive search* if `penalty_alpha>0.` and `top_k>1`
+        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
+        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
+        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
+        - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
+        - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
+        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

-    You do not need to call any of the above methods directly. Pass custom parameter values to '.generate()'. To learn
-    more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
+    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).

    <Tip>
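For orientation, the bullet list above maps directly onto `generate()` arguments; none of the listed strategies require calling a decoding method by hand. A minimal sketch of selecting a few of them (checkpoint and prompt are illustrative assumptions, not part of this diff):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")

greedy = model.generate(**inputs, max_new_tokens=20)                   # num_beams=1, do_sample=False (defaults)
sampled = model.generate(**inputs, do_sample=True, max_new_tokens=20)  # multinomial sampling
beams = model.generate(**inputs, num_beams=4, max_new_tokens=20)       # beam-search decoding
contrastive = model.generate(
    **inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=20            # contrastive search
)
```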
@@ -81,7 +81,6 @@ from .stopping_criteria import (
    StoppingCriteria,
    StoppingCriteriaList,
    StopStringCriteria,
-    validate_stopping_criteria,
)
@@ -333,25 +332,16 @@ class GenerationMixin:
    A class containing all functions for auto-regressive text generation, to be used as a mixin in [`PreTrainedModel`].

    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
-        - *greedy decoding* by calling [`~generation.GenerationMixin._greedy_search`] if `num_beams=1` and
-          `do_sample=False`
-        - *contrastive search* by calling [`~generation.GenerationMixin._contrastive_search`] if `penalty_alpha>0` and
-          `top_k>1`
-        - *multinomial sampling* by calling [`~generation.GenerationMixin._sample`] if `num_beams=1` and
-          `do_sample=True`
-        - *beam-search decoding* by calling [`~generation.GenerationMixin._beam_search`] if `num_beams>1` and
-          `do_sample=False`
-        - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin._beam_sample`] if `num_beams>1`
-          and `do_sample=True`
-        - *diverse beam-search decoding* by calling [`~generation.GenerationMixin._group_beam_search`], if `num_beams>1`
-          and `num_beam_groups>1`
-        - *constrained beam-search decoding* by calling [`~generation.GenerationMixin._constrained_beam_search`], if
-          `constraints!=None` or `force_words_ids!=None`
-        - *assisted decoding* by calling [`~generation.GenerationMixin._assisted_decoding`], if
-          `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`
+        - *greedy decoding* if `num_beams=1` and `do_sample=False`
+        - *contrastive search* if `penalty_alpha>0` and `top_k>1`
+        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
+        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
+        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
+        - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
+        - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
+        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

-    You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
-    learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
+    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
    """

    def prepare_inputs_for_generation(self, *args, **kwargs):
@@ -474,7 +464,11 @@ class GenerationMixin:
        return torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)

    def _prepare_encoder_decoder_kwargs_for_generation(
-        self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None
+        self,
+        inputs_tensor: torch.Tensor,
+        model_kwargs,
+        model_input_name: Optional[str],
+        generation_config: GenerationConfig,
    ) -> Dict[str, Any]:
        # 1. get encoder
        encoder = self.get_encoder()
@@ -486,7 +480,7 @@ class GenerationMixin:
        else:
            add_hook_to_module(encoder, AlignDevicesHook(io_same_device=True))

-        # 2. Prepare encoder args and encoder kwargs from model kwargs.
+        # 2. Prepare encoder args and encoder kwargs from model kwargs and generation config.
        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
        encoder_kwargs = {
            argument: value
@@ -499,6 +493,8 @@ class GenerationMixin:
        encoder_kwargs = {
            argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
        }
+        encoder_kwargs["output_attentions"] = generation_config.output_attentions
+        encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states

        # 3. make sure that encoder returns `ModelOutput`
        model_input_name = model_input_name if model_input_name is not None else self.main_input_name
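Net effect of the two hunks above: the encoder-side `output_attentions` / `output_hidden_states` flags are now read from the `GenerationConfig` threaded into this helper, instead of being injected into `model_kwargs` earlier in `generate()`. A rough caller-side sketch of the observable behaviour (checkpoint name is an illustrative assumption; output fields not re-verified here):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")

# `output_attentions` lands in the generation config built from these kwargs and is
# forwarded to the encoder inside _prepare_encoder_decoder_kwargs_for_generation.
out = model.generate(
    **inputs, max_new_tokens=10, output_attentions=True, return_dict_in_generate=True
)
print(len(out.encoder_attentions))  # one attention tensor per encoder layer
```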
@@ -1374,7 +1370,7 @@ class GenerationMixin:
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
-            generation_config (`~generation.GenerationConfig`, *optional*):
+            generation_config ([`~generation.GenerationConfig`], *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
@@ -1475,8 +1471,6 @@ class GenerationMixin:
        batch_size = inputs_tensor.shape[0]

        # 4. Define other model kwargs
-        model_kwargs["output_attentions"] = generation_config.output_attentions
-        model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
        # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are
        # generating the first new token or not, and we only want to use the embeddings for the first new token)
        if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds":
@@ -1510,7 +1504,7 @@ class GenerationMixin:
            # if model is encoder decoder encoder_outputs are created
            # and added to `model_kwargs`
            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
-                inputs_tensor, model_kwargs, model_input_name
+                inputs_tensor, model_kwargs, model_input_name, generation_config
            )

        # 5. Prepare `input_ids` which will be used for auto-regressive generation
@@ -1593,6 +1587,7 @@ class GenerationMixin:
        prepared_stopping_criteria = self._get_stopping_criteria(
            generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs
        )
+
        # 10. go into different generation modes
        if generation_mode == GenerationMode.ASSISTED_GENERATION:
            if generation_config.num_return_sequences > 1:
@@ -1617,18 +1612,19 @@ class GenerationMixin:
                model_kwargs=model_kwargs,
            )

-            # 12. run assisted generate
+            # 12. prepare logits warper (if `do_sample` is `True`)
+            prepared_logits_warper = (
+                self._get_logits_warper(generation_config) if generation_config.do_sample else None
+            )
+
+            # 13. run assisted generate
            result = self._assisted_decoding(
                input_ids,
                candidate_generator=candidate_generator,
-                do_sample=generation_config.do_sample,
                logits_processor=prepared_logits_processor,
-                logits_warper=self._get_logits_warper(generation_config) if generation_config.do_sample else None,
+                logits_warper=prepared_logits_warper,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
                streamer=streamer,
                **model_kwargs,
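The private `_assisted_decoding` entry point above is only ever reached through `generate()`. A minimal sketch of triggering assisted decoding from user code (checkpoints are illustrative assumptions):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-large")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-large")
assistant = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")  # small draft model

inputs = tokenizer("Alice and Bob", return_tensors="pt")

# Passing `assistant_model` routes generate() into GenerationMode.ASSISTED_GENERATION;
# sampling-related settings (do_sample, temperature, ...) end up in the logits warper prepared above.
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```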
@@ -1639,10 +1635,7 @@ class GenerationMixin:
                input_ids,
                logits_processor=prepared_logits_processor,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
                streamer=streamer,
                **model_kwargs,
@@ -1654,17 +1647,11 @@ class GenerationMixin:
            result = self._contrastive_search(
                input_ids,
-                top_k=generation_config.top_k,
-                penalty_alpha=generation_config.penalty_alpha,
                logits_processor=prepared_logits_processor,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
                streamer=streamer,
-                sequential=generation_config.low_memory,
                **model_kwargs,
            )
@@ -1686,10 +1673,7 @@ class GenerationMixin:
                logits_processor=prepared_logits_processor,
                logits_warper=logits_warper,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
                streamer=streamer,
                **model_kwargs,
@@ -1719,12 +1703,8 @@ class GenerationMixin:
                beam_scorer,
                logits_processor=prepared_logits_processor,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
-                sequential=generation_config.low_memory,
                **model_kwargs,
            )
@@ -1758,10 +1738,7 @@ class GenerationMixin:
                logits_processor=prepared_logits_processor,
                logits_warper=logits_warper,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
                **model_kwargs,
            )
@@ -1791,10 +1768,7 @@ class GenerationMixin:
                beam_scorer,
                logits_processor=prepared_logits_processor,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
                **model_kwargs,
            )
@@ -1864,10 +1838,7 @@ class GenerationMixin:
                constrained_beam_scorer=constrained_beam_scorer,
                logits_processor=prepared_logits_processor,
                stopping_criteria=prepared_stopping_criteria,
-                pad_token_id=generation_config.pad_token_id,
-                output_scores=generation_config.output_scores,
-                output_logits=generation_config.output_logits,
-                return_dict_in_generate=generation_config.return_dict_in_generate,
+                generation_config=generation_config,
                synced_gpus=synced_gpus,
                **model_kwargs,
            )
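All of the dispatch branches above are internal; from user code they are still reached through `generate()` arguments alone. A hedged sketch of the constrained beam-search path (checkpoint and forced word are illustrative assumptions):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")
force_words_ids = [tokenizer(["Sie"], add_special_tokens=False).input_ids]

# `force_words_ids` (or `constraints`) routes generate() into the _constrained_beam_search
# branch above; beam search (num_beams > 1) is required for constrained decoding.
outputs = model.generate(**inputs, num_beams=5, force_words_ids=force_words_ids, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```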
@@ -1892,87 +1863,37 @@ class GenerationMixin:
                return False
        return True

-    def contrastive_search(self, *args, **kwargs):
-        logger.warning_once(
-            "Calling `contrastive_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
-            "custom generation loop instead.",
-        )
-        return self._contrastive_search(*args, **kwargs)
-
    @torch.no_grad()
    def _contrastive_search(
        self,
        input_ids: torch.LongTensor,
-        top_k: Optional[int] = 1,
-        penalty_alpha: Optional[float] = 0,
-        logits_processor: Optional[LogitsProcessorList] = None,
-        logits_warper: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[StoppingCriteriaList] = None,
-        pad_token_id: Optional[int] = None,
-        eos_token_id: Optional[Union[int, List[int]]] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_scores: Optional[bool] = None,
-        output_logits: Optional[bool] = None,
-        return_dict_in_generate: Optional[bool] = None,
-        synced_gpus: bool = False,
-        streamer: Optional["BaseStreamer"] = None,
-        sequential: Optional[bool] = None,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: Optional["BaseStreamer"],
        **model_kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

-        <Tip warning={true}>
-
-        In most cases, you do not need to call [`~generation.GenerationMixin._contrastive_search`] directly. Use
-        generate() instead. For an overview of generation strategies and code examples, check the [following
-        guide](../generation_strategies).
-
-        </Tip>
-
        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
-            top_k (`int`, *optional*, defaults to 1):
-                The size of the candidate set that is used to re-rank for contrastive search
-            penalty_alpha (`float`, *optional*, defaults to 0):
-                The degeneration penalty for contrastive search; activate when it is larger than 0
-            logits_processor (`LogitsProcessorList`, *optional*):
+            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
-            logits_warper (`LogitsProcessorList`, *optional*):
-                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
-                to warp the prediction score distribution of the language modeling head applied before multinomial
-                sampling at each generation step.
-            stopping_criteria (`StoppingCriteriaList`, *optional*):
+            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
-            pad_token_id (`int`, *optional*):
-                The id of the *padding* token.
-            eos_token_id (`Union[int, List[int]]`, *optional*):
-                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more details.
-            output_scores (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            output_logits (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the raw prediction logit scores. See `logits` under returned tensors
-                for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-            synced_gpus (`bool`, *optional*, defaults to `False`):
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
-            sequential (`bool`, *optional*):
-                Switches topk hidden state computation from parallel to sequential to reduce memory if True.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -1983,69 +1904,18 @@ class GenerationMixin:
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
+        """

-        Examples:
-        ```python
-        >>> from transformers import (
-        ...     AutoTokenizer,
-        ...     AutoModelForCausalLM,
-        ...     StoppingCriteriaList,
-        ...     MaxLengthCriteria,
-        ... )
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
-        >>> model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
-        >>> # set pad_token_id to eos_token_id because OPT does not have a PAD token
-        >>> model.config.pad_token_id = model.config.eos_token_id
-        >>> input_prompt = "DeepMind Company is"
-        >>> input_ids = tokenizer(input_prompt, return_tensors="pt")
-        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=64)])
-        >>> outputs = model._contrastive_search(
-        ...     **input_ids, penalty_alpha=0.6, top_k=4, stopping_criteria=stopping_criteria
-        ... )
-        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it']
-        ```"""
        # init values
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
-        if eos_token_id is not None:
-            logger.warning_once(
-                "`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
-                " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
-                " Otherwise make sure to set `model.generation_config.eos_token_id`",
-                FutureWarning,
-            )
-            stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
-        else:
-            # TODO remove when the method is totally private
-            # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
-            eos_token_id = [
-                criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
-            ]
-            eos_token_id = eos_token_id[0] if eos_token_id else None
-            if eos_token_id is None and self.generation_config.eos_token_id is not None:
-                eos_token_id = self.generation_config.eos_token_id
-                stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
-            if isinstance(eos_token_id, int):
-                eos_token_id = [eos_token_id]
-        sequential = sequential if sequential is not None else self.generation_config.low_memory
-        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
-        output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
-        output_attentions = (
-            output_attentions if output_attentions is not None else self.generation_config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
-        )
-        return_dict_in_generate = (
-            return_dict_in_generate
-            if return_dict_in_generate is not None
-            else self.generation_config.return_dict_in_generate
-        )
+        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
+        top_k = generation_config.top_k
+        penalty_alpha = generation_config.penalty_alpha
+        pad_token_id = generation_config.pad_token_id
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        sequential = generation_config.low_memory

        # init attention / hidden states / scores tuples
        raw_logits = () if (return_dict_in_generate and output_logits) else None
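The removed doctest above invoked `model._contrastive_search` directly; with this PR the only supported entry point is `generate()`. A hedged re-creation of that example through the public API (same checkpoint and prompt as the removed snippet; output text not re-verified here):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model.config.pad_token_id = model.config.eos_token_id  # OPT has no dedicated PAD token

inputs = tokenizer("DeepMind Company is", return_tensors="pt")

# penalty_alpha > 0 together with top_k > 1 routes generate() into contrastive search;
# max_length replaces the manual MaxLengthCriteria from the removed example.
outputs = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_length=64)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```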
@@ -2123,7 +1993,6 @@ class GenerationMixin:
            # contrastive search decoding consists of two steps: (1) candidate tokens recall; (2) candidate re-rank by
            # degeneration penalty
            processed_logit_for_next_step = logits_processor(input_ids, logit_for_next_step)
-            processed_logit_for_next_step = logits_warper(input_ids, processed_logit_for_next_step)

            next_probs = nn.functional.softmax(processed_logit_for_next_step, dim=-1)

            top_k_probs, top_k_ids = torch.topk(next_probs, dim=-1, k=top_k)
@@ -2294,9 +2163,7 @@ class GenerationMixin:
                continue  # don't waste resources running the code we don't need

            # finished sentences should have their next token be a padding token
-            if eos_token_id is not None:
-                if pad_token_id is None:
-                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+            if has_eos_stopping_criteria:
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

            # update generated ids, model inputs, and length for next step
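The masking trick above is worth spelling out: once a row's `unfinished_sequences` flag drops to 0, its next token is forced to `pad_token_id`. A standalone sketch of the same arithmetic (values are illustrative):

```python
import torch

pad_token_id = 0
next_tokens = torch.tensor([5, 7, 9])            # freshly selected tokens, one per sequence
unfinished_sequences = torch.tensor([1, 0, 1])   # the middle sequence already hit EOS

# finished rows (flag 0) are overwritten with the pad token; unfinished rows keep their token
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
print(next_tokens)  # tensor([5, 0, 9])
```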
@@ -2352,74 +2219,32 @@ class GenerationMixin:
        else:
            return input_ids

-    def greedy_search(self, *args, **kwargs):
-        logger.warning_once(
-            "Calling `greedy_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
-            "custom generation loop instead.",
-        )
-        return self._greedy_search(*args, **kwargs)
-
    def _greedy_search(
        self,
        input_ids: torch.LongTensor,
-        logits_processor: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[StoppingCriteriaList] = None,
-        max_length: Optional[int] = None,
-        pad_token_id: Optional[int] = None,
-        eos_token_id: Optional[Union[int, List[int]]] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_scores: Optional[bool] = None,
-        output_logits: Optional[bool] = None,
-        return_dict_in_generate: Optional[bool] = None,
-        synced_gpus: bool = False,
-        streamer: Optional["BaseStreamer"] = None,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: Optional["BaseStreamer"],
        **model_kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be
        used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

-        <Tip warning={true}>
-
-        In most cases, you do not need to call [`~generation.GenerationMixin._greedy_search`] directly. Use generate()
-        instead. For an overview of generation strategies and code examples, check the [following
-        guide](../generation_strategies).
-
-        </Tip>
-
        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
-            logits_processor (`LogitsProcessorList`, *optional*):
+            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
-            stopping_criteria (`StoppingCriteriaList`, *optional*):
+            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
-            max_length (`int`, *optional*, defaults to 20):
-                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
-                tokens. The maximum length of the sequence to be generated.
-            pad_token_id (`int`, *optional*):
-                The id of the *padding* token.
-            eos_token_id (`Union[int, List[int]]`, *optional*):
-                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more details.
-            output_scores (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            output_logits (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the raw prediction logit scores. See `logits` under returned tensors
-                for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-            synced_gpus (`bool`, *optional*, defaults to `False`):
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
@@ -2434,87 +2259,15 @@ class GenerationMixin:
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
+        """

-        Examples:
-        ```python
-        >>> from transformers import (
-        ...     AutoTokenizer,
-        ...     AutoModelForCausalLM,
-        ...     LogitsProcessorList,
-        ...     MinLengthLogitsProcessor,
-        ...     StoppingCriteriaList,
-        ...     MaxLengthCriteria,
-        ... )
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
-        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
-        >>> input_prompt = "It might be possible to"
-        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
-        >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList(
-        ...     [
-        ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
-        ...     ]
-        ... )
-        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
-        >>> outputs = model._greedy_search(
-        ...     input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria
-        ... )
-        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
-        ```"""
        # init values
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-        if max_length is not None:
-            warnings.warn(
-                "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
-                UserWarning,
-            )
-            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
-        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
-        if eos_token_id is not None:
-            logger.warning_once(
-                "`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
-                " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
-                " Otherwise make sure to set `model.generation_config.eos_token_id`",
-                FutureWarning,
-            )
-            stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
-        else:
-            # TODO remove when the method is totally private
-            # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
-            eos_token_id = [
-                criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
-            ]
-            eos_token_id = eos_token_id[0] if eos_token_id else None
-            if eos_token_id is None and self.generation_config.eos_token_id is not None:
-                eos_token_id = self.generation_config.eos_token_id
-                stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
-            if isinstance(eos_token_id, int):
-                eos_token_id = [eos_token_id]
-        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
-        output_attentions = (
-            output_attentions if output_attentions is not None else self.generation_config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
-        )
-        return_dict_in_generate = (
-            return_dict_in_generate
-            if return_dict_in_generate is not None
-            else self.generation_config.return_dict_in_generate
-        )
+        pad_token_id = generation_config.pad_token_id
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)

        # init attention / hidden states / scores tuples
        raw_logits = () if (return_dict_in_generate and output_logits) else None
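The removed doctest above called `model._greedy_search` directly; the equivalent public call now goes through `generate()`. A hedged re-creation of that example (same checkpoint and prompt; output text not re-verified here):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model.generation_config.pad_token_id = model.generation_config.eos_token_id  # GPT-2 has no PAD token

input_ids = tokenizer("It might be possible to", return_tensors="pt").input_ids

# num_beams=1 and do_sample=False (the defaults) select greedy decoding;
# max_length / min_length replace the manual MaxLengthCriteria and MinLengthLogitsProcessor.
outputs = model.generate(input_ids, max_length=20, min_length=10)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```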
@@ -2580,9 +2333,7 @@ class GenerationMixin:
            next_tokens = torch.argmax(next_tokens_scores, dim=-1)

            # finished sentences should have their next token be a padding token
-            if eos_token_id is not None:
-                if pad_token_id is None:
-                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+            if has_eos_stopping_criteria:
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

            # update generated ids, model inputs, and length for next step
@@ -2626,77 +2377,37 @@ class GenerationMixin:
        else:
            return input_ids

-    def sample(self, *args, **kwargs):
-        logger.warning_once(
-            "Calling `sample` directly is deprecated and will be removed in v4.41. Use `generate` or a "
-            "custom generation loop instead.",
-        )
-        return self._sample(*args, **kwargs)
-
    def _sample(
        self,
        input_ids: torch.LongTensor,
-        logits_processor: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[StoppingCriteriaList] = None,
-        logits_warper: Optional[LogitsProcessorList] = None,
-        max_length: Optional[int] = None,
-        pad_token_id: Optional[int] = None,
-        eos_token_id: Optional[Union[int, List[int]]] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_scores: Optional[bool] = None,
-        output_logits: Optional[bool] = None,
-        return_dict_in_generate: Optional[bool] = None,
-        synced_gpus: bool = False,
-        streamer: Optional["BaseStreamer"] = None,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        logits_warper: LogitsProcessorList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
+        streamer: Optional["BaseStreamer"],
        **model_kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

-        <Tip warning={true}>
-
-        In most cases, you do not need to call [`~generation.GenerationMixin._sample`] directly. Use generate() instead.
-        For an overview of generation strategies and code examples, check the [following
-        guide](../generation_strategies).
-
-        </Tip>
-
        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
-            logits_processor (`LogitsProcessorList`, *optional*):
+            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
-            stopping_criteria (`StoppingCriteriaList`, *optional*):
+            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
-            logits_warper (`LogitsProcessorList`, *optional*):
+            logits_warper (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
-            max_length (`int`, *optional*, defaults to 20):
-                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
-                tokens. The maximum length of the sequence to be generated.
-            pad_token_id (`int`, *optional*):
-                The id of the *padding* token.
-            eos_token_id (`Union[int, List[int]]`, *optional*):
-                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more details.
-            output_scores (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            output_logits (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
-                more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-            synced_gpus (`bool`, *optional*, defaults to `False`):
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
@@ -2711,105 +2422,15 @@ class GenerationMixin:
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
+        """

-        Examples:
-        ```python
-        >>> from transformers import (
-        ...     AutoTokenizer,
-        ...     AutoModelForCausalLM,
-        ...     LogitsProcessorList,
-        ...     MinLengthLogitsProcessor,
-        ...     TopKLogitsWarper,
-        ...     TemperatureLogitsWarper,
-        ...     StoppingCriteriaList,
-        ...     MaxLengthCriteria,
-        ... )
-        >>> import torch
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
-        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-        >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
-        >>> model.config.pad_token_id = model.config.eos_token_id
-        >>> model.generation_config.pad_token_id = model.config.eos_token_id
-        >>> input_prompt = "Today is a beautiful day, and"
-        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
-        >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList(
-        ...     [
-        ...         MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
-        ...     ]
-        ... )
-        >>> # instantiate logits processors
-        >>> logits_warper = LogitsProcessorList(
-        ...     [
-        ...         TopKLogitsWarper(50),
-        ...         TemperatureLogitsWarper(0.7),
-        ...     ]
-        ... )
-        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
-        >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
-        >>> outputs = model._sample(
-        ...     input_ids,
-        ...     logits_processor=logits_processor,
-        ...     logits_warper=logits_warper,
-        ...     stopping_criteria=stopping_criteria,
-        ... )
-        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        ['Today is a beautiful day, and we must do everything possible to make it a day of celebration.']
-        ```"""
        # init values
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-        if max_length is not None:
-            warnings.warn(
-                "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
-                UserWarning,
-            )
-            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
-        logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
-        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
-        if eos_token_id is not None:
-            logger.warning_once(
-                "`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
-                " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
-                " Otherwise make sure to set `model.generation_config.eos_token_id`",
-                FutureWarning,
-            )
-            stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
-        else:
-            # TODO remove when the method is totally private
-            # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
-            eos_token_id = [
-                criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
-            ]
-            eos_token_id = eos_token_id[0] if eos_token_id else None
-            if eos_token_id is None and self.generation_config.eos_token_id is not None:
-                eos_token_id = self.generation_config.eos_token_id
-                stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
-            if isinstance(eos_token_id, int):
-                eos_token_id = [eos_token_id]
-        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
-        output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
-        output_attentions = (
-            output_attentions if output_attentions is not None else self.generation_config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
-        )
-        return_dict_in_generate = (
-            return_dict_in_generate
-            if return_dict_in_generate is not None
-            else self.generation_config.return_dict_in_generate
-        )
+        pad_token_id = generation_config.pad_token_id
+        output_attentions = generation_config.output_attentions
+        output_hidden_states = generation_config.output_hidden_states
+        output_scores = generation_config.output_scores
+        output_logits = generation_config.output_logits
+        return_dict_in_generate = generation_config.return_dict_in_generate
+        has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)

        # init attention / hidden states / scores tuples
        scores = () if (return_dict_in_generate and output_scores) else None
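The removed `_sample` doctest can be reproduced through `generate()`; a hedged sketch (same checkpoint and prompt as the removed snippet; the sampled continuation will differ run to run unless seeded):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model.generation_config.pad_token_id = model.generation_config.eos_token_id

input_ids = tokenizer("Today is a beautiful day, and", return_tensors="pt").input_ids

torch.manual_seed(0)
# do_sample=True selects multinomial sampling; top_k / temperature build the logits warpers
# that the removed example constructed by hand (TopKLogitsWarper, TemperatureLogitsWarper).
outputs = model.generate(
    input_ids, do_sample=True, top_k=50, temperature=0.7, max_length=20, min_length=15
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```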
@@ -2877,9 +2498,7 @@ class GenerationMixin:
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

            # finished sentences should have their next token be a padding token
-            if eos_token_id is not None:
-                if pad_token_id is None:
-                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+            if has_eos_stopping_criteria:
                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

            # update generated ids, model inputs, and length for next step
@@ -2949,81 +2568,36 @@ class GenerationMixin:
        past_key_values.reorder_cache(beam_idx)
        return past_key_values

-    def beam_search(self, *args, **kwargs):
-        logger.warning_once(
-            "Calling `beam_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
-            "custom generation loop instead.",
-        )
-        return self._beam_search(*args, **kwargs)
-
    def _beam_search(
        self,
        input_ids: torch.LongTensor,
        beam_scorer: BeamScorer,
-        logits_processor: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[StoppingCriteriaList] = None,
-        max_length: Optional[int] = None,
-        pad_token_id: Optional[int] = None,
-        eos_token_id: Optional[Union[int, List[int]]] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_scores: Optional[bool] = None,
-        output_logits: Optional[bool] = None,
-        return_dict_in_generate: Optional[bool] = None,
-        synced_gpus: bool = False,
-        sequential: Optional[bool] = None,
+        logits_processor: LogitsProcessorList,
+        stopping_criteria: StoppingCriteriaList,
+        generation_config: GenerationConfig,
+        synced_gpus: bool,
        **model_kwargs,
    ) -> Union[GenerateBeamOutput, torch.LongTensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

-        <Tip warning={true}>
-
-        In most cases, you do not need to call [`~generation.GenerationMixin._beam_search`] directly. Use generate()
-        instead. For an overview of generation strategies and code examples, check the [following
-        guide](../generation_strategies).
-
-        </Tip>
-
        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
-            logits_processor (`LogitsProcessorList`, *optional*):
+            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
-            stopping_criteria (`StoppingCriteriaList`, *optional*):
+            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
-            max_length (`int`, *optional*, defaults to 20):
-                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
-                tokens. The maximum length of the sequence to be generated.
-            pad_token_id (`int`, *optional*):
-                The id of the *padding* token.
-            eos_token_id (`Union[int, List[int]]`, *optional*):
-                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more details.
-            output_logits (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
-                more details.
-            output_scores (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-            synced_gpus (`bool`, *optional*, defaults to `False`):
+            generation_config ([`~generation.GenerationConfig`]):
+                The generation configuration to be used as parametrization of the decoding method.
+            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
-            sequential (`bool`, defaults to `False`):
-                By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for
-                more details). This flag will avoid parallelizing the beam search and will instead run beam search
-                sequentially.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.
...@@ -3034,107 +2608,19 @@ class GenerationMixin: ...@@ -3034,107 +2608,19 @@ class GenerationMixin:
[`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
`return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`. `model.config.is_encoder_decoder=True`.
"""
Examples:
```python
>>> from transformers import (
... AutoTokenizer,
... AutoModelForSeq2SeqLM,
... LogitsProcessorList,
... MinLengthLogitsProcessor,
... BeamSearchScorer,
... )
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
>>> encoder_input_str = "translate English to German: How old are you?"
>>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
>>> # lets run beam search using 3 beams
>>> num_beams = 3
>>> # define decoder start token ids
>>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
>>> input_ids = input_ids * model.config.decoder_start_token_id
>>> # add encoder_outputs to model keyword arguments
>>> model_kwargs = {
... "encoder_outputs": model.get_encoder()(
... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
... )
... }
>>> # instantiate beam scorer
>>> beam_scorer = BeamSearchScorer(
... batch_size=1,
... num_beams=num_beams,
... device=model.device,
... )
>>> # instantiate logits processors
>>> logits_processor = LogitsProcessorList(
... [
... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
... ]
... )
>>> outputs = model._beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Wie alt bist du?']
```"""
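With the public doctest removed, the supported entry point for beam search is `generate()`. The sketch below is illustrative only, reusing the T5 checkpoint and settings from the deleted example; `num_beams>1` with `do_sample=False` selects beam search, and passing a `GenerationConfig` mirrors how the private loop now reads its flags off `generation_config`:
```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")

# Beam search: num_beams > 1 and do_sample=False. Decoding flags live on the config object.
generation_config = GenerationConfig(num_beams=3, do_sample=False, max_new_tokens=20)
outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```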
# init values # init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() pad_token_id = generation_config.pad_token_id
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() eos_token_id = generation_config.eos_token_id
sequential = sequential if sequential is not None else self.generation_config.low_memory output_attentions = generation_config.output_attentions
if max_length is not None: output_hidden_states = generation_config.output_hidden_states
warnings.warn( output_scores = generation_config.output_scores
"`max_length` is deprecated in this function, use" output_logits = generation_config.output_logits
" `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", return_dict_in_generate = generation_config.return_dict_in_generate
UserWarning, sequential = generation_config.low_memory
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
if len(stopping_criteria) == 0:
warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning)
pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
if eos_token_id is not None:
logger.warning_once(
"`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
" `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
" Otherwise make sure to set `model.generation_config.eos_token_id`",
FutureWarning,
)
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
else:
# TODO remove when the method is totally private and beam scorer refactored
# need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
eos_token_id = [
criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
]
eos_token_id = eos_token_id[0] if eos_token_id else None
if eos_token_id is None and self.generation_config.eos_token_id is not None:
eos_token_id = self.generation_config.eos_token_id
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
output_attentions = (
output_attentions if output_attentions is not None else self.generation_config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
)
return_dict_in_generate = (
return_dict_in_generate
if return_dict_in_generate is not None
else self.generation_config.return_dict_in_generate
)
batch_size = len(beam_scorer._beam_hyps) batch_size = len(beam_scorer._beam_hyps)
num_beams = beam_scorer.num_beams num_beams = beam_scorer.num_beams
...@@ -3348,80 +2834,40 @@ class GenerationMixin: ...@@ -3348,80 +2834,40 @@ class GenerationMixin:
else: else:
return sequence_outputs["sequences"] return sequence_outputs["sequences"]
def beam_sample(self, *args, **kwargs):
logger.warning_once(
"Calling `beam_sample` directly is deprecated and will be removed in v4.41. Use `generate` or a "
"custom generation loop instead.",
)
return self._beam_sample(*args, **kwargs)
def _beam_sample( def _beam_sample(
self, self,
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
beam_scorer: BeamScorer, beam_scorer: BeamScorer,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: LogitsProcessorList,
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: StoppingCriteriaList,
logits_warper: Optional[LogitsProcessorList] = None, logits_warper: LogitsProcessorList,
max_length: Optional[int] = None, generation_config: GenerationConfig,
pad_token_id: Optional[int] = None, synced_gpus: bool,
eos_token_id: Optional[Union[int, List[int]]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_scores: Optional[bool] = None,
output_logits: Optional[bool] = None,
return_dict_in_generate: Optional[bool] = None,
synced_gpus: bool = False,
**model_kwargs, **model_kwargs,
) -> Union[GenerateBeamOutput, torch.LongTensor]: ) -> Union[GenerateBeamOutput, torch.LongTensor]:
r""" r"""
Generates sequences of token ids for models with a language modeling head using **beam search multinomial Generates sequences of token ids for models with a language modeling head using **beam search multinomial
sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
<Tip warning={true}>
In most cases, you do not need to call [`~generation.GenerationMixin._beam_sample`] directly. Use generate()
instead. For an overview of generation strategies and code examples, check the [following
guide](../generation_strategies).
</Tip>
Parameters: Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation. The sequence used as a prompt for the generation.
beam_scorer (`BeamScorer`): beam_scorer (`BeamScorer`):
A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
logits_processor (`LogitsProcessorList`, *optional*): logits_processor (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step. used to modify the prediction scores of the language modeling head applied at each generation step.
stopping_criteria (`StoppingCriteriaList`, *optional*): stopping_criteria (`StoppingCriteriaList`):
An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
used to tell if the generation loop should stop. used to tell if the generation loop should stop.
logits_warper (`LogitsProcessorList`, *optional*): logits_warper (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
to warp the prediction score distribution of the language modeling head applied before multinomial to warp the prediction score distribution of the language modeling head applied before multinomial
sampling at each generation step. sampling at each generation step.
max_length (`int`, *optional*, defaults to 20): generation_config ([`~generation.GenerationConfig`]):
**DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated The generation configuration to be used as parametrization of the decoding method.
tokens. The maximum length of the sequence to be generated. synced_gpus (`bool`):
pad_token_id (`int`, *optional*):
The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
output_hidden_states (`bool`, *optional*, defaults to `False`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
output_scores (`bool`, *optional*, defaults to `False`):
Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
output_logits (`bool`, *optional*, defaults to `False`):
Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
more details.
return_dict_in_generate (`bool`, *optional*, defaults to `False`):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3) Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs: model_kwargs:
Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
...@@ -3433,112 +2879,18 @@ class GenerationMixin: ...@@ -3433,112 +2879,18 @@ class GenerationMixin:
[`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
`return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`. `model.config.is_encoder_decoder=True`.
"""
Examples:
```python
>>> from transformers import (
... AutoTokenizer,
... AutoModelForSeq2SeqLM,
... LogitsProcessorList,
... MinLengthLogitsProcessor,
... TopKLogitsWarper,
... TemperatureLogitsWarper,
... BeamSearchScorer,
... )
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
>>> encoder_input_str = "translate English to German: How old are you?"
>>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
>>> # lets run beam search using 3 beams
>>> num_beams = 3
>>> # define decoder start token ids
>>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
>>> input_ids = input_ids * model.config.decoder_start_token_id
>>> # add encoder_outputs to model keyword arguments
>>> model_kwargs = {
... "encoder_outputs": model.get_encoder()(
... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
... )
... }
>>> # instantiate beam scorer
>>> beam_scorer = BeamSearchScorer(
... batch_size=1,
... max_length=model.config.max_length,
... num_beams=num_beams,
... device=model.device,
... )
>>> # instantiate logits processors
>>> logits_processor = LogitsProcessorList(
... [MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)]
... )
>>> # instantiate logits processors
>>> logits_warper = LogitsProcessorList(
... [
... TopKLogitsWarper(50),
... TemperatureLogitsWarper(0.7),
... ]
... )
>>> outputs = model._beam_sample(
... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs
... )
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Wie alt bist du?']
```"""
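As above, the equivalent public call goes through `generate()`; a minimal sketch with illustrative sampling settings (the `top_k`/`temperature` values echo the removed warpers). `num_beams>1` together with `do_sample=True` selects beam-search multinomial sampling:
```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")

# Beam-search multinomial sampling: num_beams > 1 and do_sample=True;
# top_k/temperature build the logits warpers internally.
outputs = model.generate(
    **inputs, num_beams=3, do_sample=True, top_k=50, temperature=0.7, max_new_tokens=20
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```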
# init values # init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() pad_token_id = generation_config.pad_token_id
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() eos_token_id = generation_config.eos_token_id
if max_length is not None: output_attentions = generation_config.output_attentions
warnings.warn( output_hidden_states = generation_config.output_hidden_states
"`max_length` is deprecated in this function, use" output_scores = generation_config.output_scores
" `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", output_logits = generation_config.output_logits
UserWarning, return_dict_in_generate = generation_config.return_dict_in_generate
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
if eos_token_id is not None:
logger.warning_once(
"`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
" `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
" Otherwise make sure to set `model.generation_config.eos_token_id`",
FutureWarning,
)
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
else:
# TODO remove when the method is totally private and beam scorer refactored
# need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
eos_token_id = [
criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
]
eos_token_id = eos_token_id[0] if eos_token_id else None
if eos_token_id is None and self.generation_config.eos_token_id is not None:
eos_token_id = self.generation_config.eos_token_id
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
output_attentions = (
output_attentions if output_attentions is not None else self.generation_config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
)
return_dict_in_generate = (
return_dict_in_generate
if return_dict_in_generate is not None
else self.generation_config.return_dict_in_generate
)
batch_size = len(beam_scorer._beam_hyps) batch_size = len(beam_scorer._beam_hyps)
num_beams = beam_scorer.num_beams num_beams = beam_scorer.num_beams
...@@ -3710,77 +3062,36 @@ class GenerationMixin: ...@@ -3710,77 +3062,36 @@ class GenerationMixin:
else: else:
return sequence_outputs["sequences"] return sequence_outputs["sequences"]
def group_beam_search(self, *args, **kwargs):
logger.warning_once(
"Calling `group_beam_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
"custom generation loop instead.",
)
return self._group_beam_search(*args, **kwargs)
def _group_beam_search( def _group_beam_search(
self, self,
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
beam_scorer: BeamScorer, beam_scorer: BeamScorer,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: LogitsProcessorList,
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: StoppingCriteriaList,
max_length: Optional[int] = None, generation_config: GenerationConfig,
pad_token_id: Optional[int] = None, synced_gpus: bool,
eos_token_id: Optional[Union[int, List[int]]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_scores: Optional[bool] = None,
output_logits: Optional[bool] = None,
return_dict_in_generate: Optional[bool] = None,
synced_gpus: bool = False,
**model_kwargs, **model_kwargs,
): ):
r""" r"""
Generates sequences of token ids for models with a language modeling head using **diverse beam search Generates sequences of token ids for models with a language modeling head using **diverse beam search
decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
<Tip warning={true}>
In most cases, you do not need to call [`~generation.GenerationMixin._group_beam_search`] directly. Use
generate() instead. For an overview of generation strategies and code examples, check the [following
guide](../generation_strategies).
</Tip>
Parameters: Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation. The sequence used as a prompt for the generation.
beam_scorer (`BeamScorer`): beam_scorer (`BeamScorer`):
A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
logits_processor (`LogitsProcessorList`, *optional*): logits_processor (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step. used to modify the prediction scores of the language modeling head applied at each generation step.
stopping_criteria (`StoppingCriteriaList`, *optional*): stopping_criteria (`StoppingCriteriaList`):
An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
used to tell if the generation loop should stop. used to tell if the generation loop should stop.
max_length (`int`, *optional*, defaults to 20): generation_config ([`~generation.GenerationConfig`]):
**DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated The generation configuration to be used as parametrization of the decoding method.
tokens. The maximum length of the sequence to be generated. synced_gpus (`bool`):
pad_token_id (`int`, *optional*):
The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
output_hidden_states (`bool`, *optional*, defaults to `False`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
output_scores (`bool`, *optional*, defaults to `False`):
Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
output_logits (`bool`, *optional*, defaults to `False`):
Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
more details.
return_dict_in_generate (`bool`, *optional*, defaults to `False`):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3) Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs: model_kwargs:
Additional model specific kwargs that will be forwarded to the `forward` function of the model. If Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
model is an encoder-decoder model the kwargs should include `encoder_outputs`. model is an encoder-decoder model the kwargs should include `encoder_outputs`.
...@@ -3791,109 +3102,18 @@ class GenerationMixin: ...@@ -3791,109 +3102,18 @@ class GenerationMixin:
[`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
`return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`. `model.config.is_encoder_decoder=True`.
"""
Examples:
```python
>>> from transformers import (
... AutoTokenizer,
... AutoModelForSeq2SeqLM,
... LogitsProcessorList,
... MinLengthLogitsProcessor,
... HammingDiversityLogitsProcessor,
... BeamSearchScorer,
... )
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
>>> encoder_input_str = "translate English to German: How old are you?"
>>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
>>> # lets run diverse beam search using 6 beams
>>> num_beams = 6
>>> # define decoder start token ids
>>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
>>> input_ids = input_ids * model.config.decoder_start_token_id
>>> # add encoder_outputs to model keyword arguments
>>> model_kwargs = {
... "encoder_outputs": model.get_encoder()(
... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
... )
... }
>>> # instantiate beam scorer
>>> beam_scorer = BeamSearchScorer(
... batch_size=1,
... max_length=model.config.max_length,
... num_beams=num_beams,
... device=model.device,
... num_beam_groups=3,
... )
>>> # instantiate logits processors
>>> logits_processor = LogitsProcessorList(
... [
... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
... ]
... )
>>> outputs = model._group_beam_search(
... input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs
... )
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Wie alt bist du?']
```"""
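For diverse beam search, the `generate()` equivalent needs `num_beam_groups` plus a `diversity_penalty` (standing in for the removed `HammingDiversityLogitsProcessor`); a minimal, illustrative sketch:
```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")

# Diverse beam search: num_beams > 1 and num_beam_groups > 1
# (num_beams must be divisible by num_beam_groups).
outputs = model.generate(
    **inputs, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, max_new_tokens=20
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```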
# init values # init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() pad_token_id = generation_config.pad_token_id
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() eos_token_id = generation_config.eos_token_id
if max_length is not None: output_attentions = generation_config.output_attentions
warnings.warn( output_hidden_states = generation_config.output_hidden_states
"`max_length` is deprecated in this function, use" output_scores = generation_config.output_scores
" `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", output_logits = generation_config.output_logits
UserWarning, return_dict_in_generate = generation_config.return_dict_in_generate
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
if eos_token_id is not None:
logger.warning_once(
"`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
" `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
" Otherwise make sure to set `model.generation_config.eos_token_id`",
FutureWarning,
)
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
else:
# TODO remove when the method is totally private and beam scorer refactored
# need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
eos_token_id = [
criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
]
eos_token_id = eos_token_id[0] if eos_token_id else None
if eos_token_id is None and self.generation_config.eos_token_id is not None:
eos_token_id = self.generation_config.eos_token_id
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
output_attentions = (
output_attentions if output_attentions is not None else self.generation_config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
)
return_dict_in_generate = (
return_dict_in_generate
if return_dict_in_generate is not None
else self.generation_config.return_dict_in_generate
)
num_beams = beam_scorer.num_beams num_beams = beam_scorer.num_beams
num_beam_groups = beam_scorer.num_beam_groups num_beam_groups = beam_scorer.num_beam_groups
...@@ -4122,42 +3342,20 @@ class GenerationMixin: ...@@ -4122,42 +3342,20 @@ class GenerationMixin:
else: else:
return sequence_outputs["sequences"] return sequence_outputs["sequences"]
def constrained_beam_search(self, *args, **kwargs):
logger.warning_once(
"Calling `constrained_beam_search` directly is deprecated and will be removed in v4.41. Use `generate` or a "
"custom generation loop instead.",
)
return self._constrained_beam_search(*args, **kwargs)
def _constrained_beam_search( def _constrained_beam_search(
self, self,
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
constrained_beam_scorer: ConstrainedBeamSearchScorer, constrained_beam_scorer: ConstrainedBeamSearchScorer,
logits_processor: Optional[LogitsProcessorList] = None, logits_processor: LogitsProcessorList,
stopping_criteria: Optional[StoppingCriteriaList] = None, stopping_criteria: StoppingCriteriaList,
max_length: Optional[int] = None, generation_config: GenerationConfig,
pad_token_id: Optional[int] = None, synced_gpus: bool,
eos_token_id: Optional[Union[int, List[int]]] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_scores: Optional[bool] = None,
output_logits: Optional[bool] = None,
return_dict_in_generate: Optional[bool] = None,
synced_gpus: Optional[bool] = None,
**model_kwargs, **model_kwargs,
) -> Union[GenerateBeamOutput, torch.LongTensor]: ) -> Union[GenerateBeamOutput, torch.LongTensor]:
r""" r"""
Generates sequences of token ids for models with a language modeling head using **constrained beam search Generates sequences of token ids for models with a language modeling head using **constrained beam search
decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
<Tip warning={true}>
In most cases, you do not need to call [`~generation.GenerationMixin._constrained_beam_search`] directly. Use
generate() instead. For an overview of generation strategies and code examples, check the [following
guide](../generation_strategies).
</Tip>
Parameters: Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation. The sequence used as a prompt for the generation.
...@@ -4165,37 +3363,19 @@ class GenerationMixin: ...@@ -4165,37 +3363,19 @@ class GenerationMixin:
A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
sorted during generation, while satisfying a list of positive constraints. For more information, the sorted during generation, while satisfying a list of positive constraints. For more information, the
documentation of [`ConstrainedBeamSearchScorer`] should be read. documentation of [`ConstrainedBeamSearchScorer`] should be read.
logits_processor (`LogitsProcessorList`, *optional*): logits_processor (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step. used to modify the prediction scores of the language modeling head applied at each generation step.
stopping_criteria (`StoppingCriteriaList`, *optional*): stopping_criteria (`StoppingCriteriaList`):
An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
used to tell if the generation loop should stop. used to tell if the generation loop should stop.
logits_warper (`LogitsProcessorList`, *optional*): logits_warper (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
to warp the prediction score distribution of the language modeling head applied before multinomial to warp the prediction score distribution of the language modeling head applied before multinomial
sampling at each generation step. sampling at each generation step.
max_length (`int`, *optional*, defaults to 20): generation_config ([`~generation.GenerationConfig`]):
**DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated The generation configuration to be used as parametrization of the decoding method.
tokens. The maximum length of the sequence to be generated. synced_gpus (`bool`):
pad_token_id (`int`, *optional*):
The id of the *padding* token.
eos_token_id (`Union[int, List[int]]`, *optional*):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
output_hidden_states (`bool`, *optional*, defaults to `False`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
output_scores (`bool`, *optional*, defaults to `False`):
Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
output_logits (`bool`, *optional*, defaults to `False`):
Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
more details.
return_dict_in_generate (`bool`, *optional*, defaults to `False`):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3) Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
model_kwargs: model_kwargs:
Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
...@@ -4207,112 +3387,18 @@ class GenerationMixin: ...@@ -4207,112 +3387,18 @@ class GenerationMixin:
[`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
`return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`. `model.config.is_encoder_decoder=True`.
"""
Examples:
```python
>>> from transformers import (
... AutoTokenizer,
... AutoModelForSeq2SeqLM,
... LogitsProcessorList,
... MinLengthLogitsProcessor,
... ConstrainedBeamSearchScorer,
... PhrasalConstraint,
... )
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
>>> encoder_input_str = "translate English to German: How old are you?"
>>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
>>> # lets run beam search using 3 beams
>>> num_beams = 3
>>> # define decoder start token ids
>>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
>>> input_ids = input_ids * model.config.decoder_start_token_id
>>> # add encoder_outputs to model keyword arguments
>>> model_kwargs = {
... "encoder_outputs": model.get_encoder()(
... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
... )
... }
>>> constraint_str = "Sie"
>>> constraint_token_ids = tokenizer.encode(constraint_str)[:-1] # slice to remove eos token
>>> constraints = [PhrasalConstraint(token_ids=constraint_token_ids)]
>>> # instantiate beam scorer
>>> beam_scorer = ConstrainedBeamSearchScorer(
... batch_size=1, num_beams=num_beams, device=model.device, constraints=constraints
... )
>>> # instantiate logits processors
>>> logits_processor = LogitsProcessorList(
... [
... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
... ]
... )
>>> outputs = model._constrained_beam_search(
... input_ids, beam_scorer, constraints=constraints, logits_processor=logits_processor, **model_kwargs
... )
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Wie alt sind Sie?']
```"""
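For constrained beam search, the `force_words_ids` argument of `generate()` replaces the hand-built `PhrasalConstraint` above; a minimal sketch reusing the "Sie" constraint from the removed example (illustrative only):
```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")

# Constrained beam search: force_words_ids (or constraints=...) triggers it; requires num_beams > 1.
force_words_ids = tokenizer(["Sie"], add_special_tokens=False).input_ids
outputs = model.generate(**inputs, num_beams=3, force_words_ids=force_words_ids, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```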
# init values # init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() pad_token_id = generation_config.pad_token_id
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() eos_token_id = generation_config.eos_token_id
if max_length is not None: output_attentions = generation_config.output_attentions
warnings.warn( output_hidden_states = generation_config.output_hidden_states
"`max_length` is deprecated in this function, use" output_scores = generation_config.output_scores
" `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", output_logits = generation_config.output_logits
UserWarning, return_dict_in_generate = generation_config.return_dict_in_generate
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
if len(stopping_criteria) == 0:
warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning)
pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
if eos_token_id is not None:
logger.warning_once(
"`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
" `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
" Otherwise make sure to set `model.generation_config.eos_token_id`",
FutureWarning,
)
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
else:
# TODO remove when the method is totally private and beam scorer refactored
# need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
eos_token_id = [
criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
]
eos_token_id = eos_token_id[0] if eos_token_id else None
if eos_token_id is None and self.generation_config.eos_token_id is not None:
eos_token_id = self.generation_config.eos_token_id
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
if isinstance(eos_token_id, int): if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id] eos_token_id = [eos_token_id]
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
output_attentions = (
output_attentions if output_attentions is not None else self.generation_config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
)
return_dict_in_generate = (
return_dict_in_generate
if return_dict_in_generate is not None
else self.generation_config.return_dict_in_generate
)
batch_size = len(constrained_beam_scorer._beam_hyps) batch_size = len(constrained_beam_scorer._beam_hyps)
num_beams = constrained_beam_scorer.num_beams num_beams = constrained_beam_scorer.num_beams
...@@ -4490,30 +3576,16 @@ class GenerationMixin: ...@@ -4490,30 +3576,16 @@ class GenerationMixin:
else: else:
return sequence_outputs["sequences"] return sequence_outputs["sequences"]
def assisted_decoding(self, *args, **kwargs):
logger.warning_once(
"Calling `_assisted_decoding` directly is deprecated and will be removed in v4.41. Use `generate` or a "
"custom generation loop instead.",
)
return self._assisted_decoding(*args, **kwargs)
def _assisted_decoding( def _assisted_decoding(
self, self,
input_ids: torch.LongTensor, input_ids: torch.LongTensor,
candidate_generator: Optional["CandidateGenerator"] = None, candidate_generator: CandidateGenerator,
do_sample: bool = False, logits_processor: LogitsProcessorList,
logits_processor: Optional[LogitsProcessorList] = None, logits_warper: LogitsProcessorList,
logits_warper: Optional[LogitsProcessorList] = None, stopping_criteria: StoppingCriteriaList,
stopping_criteria: Optional[StoppingCriteriaList] = None, generation_config: GenerationConfig,
pad_token_id: Optional[int] = None, synced_gpus: bool,
eos_token_id: Optional[Union[int, List[int]]] = None, streamer: Optional["BaseStreamer"],
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_scores: Optional[bool] = None,
output_logits: Optional[bool] = None,
return_dict_in_generate: Optional[bool] = None,
synced_gpus: bool = False,
streamer: Optional["BaseStreamer"] = None,
**model_kwargs, **model_kwargs,
) -> Union[GenerateNonBeamOutput, torch.LongTensor]: ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
r""" r"""
...@@ -4522,50 +3594,25 @@ class GenerationMixin: ...@@ -4522,50 +3594,25 @@ class GenerationMixin:
candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
models. models.
<Tip warning={true}>
In most cases, you do not need to call [`~generation.GenerationMixin._assisted_decoding`] directly. Use
generate() instead. For an overview of generation strategies and code examples, check the [following
guide](../generation_strategies).
</Tip>
Parameters: Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation. The sequence used as a prompt for the generation.
candidate_generator (`CandidateGenerator`, *optional*): candidate_generator (`CandidateGenerator`):
A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
more information, the documentation of [`CandidateGenerator`] should be read. more information, the documentation of [`CandidateGenerator`] should be read.
do_sample (`bool`, *optional*, defaults to `False`): logits_processor (`LogitsProcessorList`):
Whether or not to use sampling ; use greedy decoding otherwise.
logits_processor (`LogitsProcessorList`, *optional*):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step. used to modify the prediction scores of the language modeling head applied at each generation step.
logits_warper (`LogitsProcessorList`, *optional*): logits_warper (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
to warp the prediction score distribution of the language modeling head applied before multinomial to warp the prediction score distribution of the language modeling head applied before multinomial
sampling at each generation step. sampling at each generation step. Only used if sampling is active.
stopping_criteria (`StoppingCriteriaList`, *optional*): stopping_criteria (`StoppingCriteriaList`):
An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
used to tell if the generation loop should stop. used to tell if the generation loop should stop.
pad_token_id (`int`, *optional*): generation_config ([`~generation.GenerationConfig`]):
The id of the *padding* token. The generation configuration to be used as parametrization of the decoding method.
eos_token_id (`Union[int, List[int]]`, *optional*): synced_gpus (`bool`):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
output_attentions (`bool`, *optional*, defaults to `False`):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more details.
output_hidden_states (`bool`, *optional*, defaults to `False`):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more details.
output_scores (`bool`, *optional*, defaults to `False`):
Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
output_logits (`bool`, *optional*, defaults to `False`):
Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
more details.
return_dict_in_generate (`bool`, *optional*, defaults to `False`):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
synced_gpus (`bool`, *optional*, defaults to `False`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3) Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
streamer (`BaseStreamer`, *optional*): streamer (`BaseStreamer`, *optional*):
Streamer object that will be used to stream the generated sequences. Generated tokens are passed Streamer object that will be used to stream the generated sequences. Generated tokens are passed
...@@ -4580,83 +3627,14 @@ class GenerationMixin: ...@@ -4580,83 +3627,14 @@ class GenerationMixin:
[`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
`return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`. `model.config.is_encoder_decoder=True`.
"""
Examples:
```python
>>> from transformers import (
... AutoTokenizer,
... AutoModelForCausalLM,
... LogitsProcessorList,
... MinLengthLogitsProcessor,
... StoppingCriteriaList,
... MaxLengthCriteria,
... )
>>> from transformers.generation import AssistedCandidateGenerator
>>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
>>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
>>> assistant_model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
>>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
>>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
>>> input_prompt = "It might be possible to"
>>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
>>> model.generation_config.min_length = 10
>>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
>>> candidate_generator = AssistedCandidateGenerator(
... input_ids=input_ids,
... assistant_model=assistant_model,
... generation_config=model.generation_config,
... model_kwargs={},
... )
>>> outputs = model._assisted_decoding(
... input_ids,
... candidate_generator=candidate_generator,
... stopping_criteria=stopping_criteria,
... )
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
["It might be possible to get a better understanding of the nature of the problem, but it's not"]
```"""
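For assisted decoding, passing `assistant_model` (or `prompt_lookup_num_tokens`) to `generate()` wires up the candidate generator internally; a minimal sketch with the GPT-2 / DistilGPT-2 pair from the removed example:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
assistant = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

inputs = tokenizer("It might be possible to", return_tensors="pt")

# Assisted (speculative) decoding: the assistant drafts candidate tokens, the main model verifies them.
outputs = model.generate(
    **inputs, assistant_model=assistant, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```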
# init values # init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() do_sample = logits_warper is not None
logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() output_attentions = generation_config.output_attentions
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() output_hidden_states = generation_config.output_hidden_states
pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id output_scores = generation_config.output_scores
if eos_token_id is not None: output_logits = generation_config.output_logits
logger.warning_once( return_dict_in_generate = generation_config.return_dict_in_generate
"`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
" `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
" Otherwise make sure to set `model.generation_config.eos_token_id`",
FutureWarning,
)
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
else:
# TODO remove when the method is totally private and beam scorer refactored
# need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
eos_token_id = [
criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
]
eos_token_id = eos_token_id[0] if eos_token_id else None
if eos_token_id is None and self.generation_config.eos_token_id is not None:
eos_token_id = self.generation_config.eos_token_id
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
output_logits = output_logits if output_logits is not None else self.generation_config.output_logits
output_attentions = (
output_attentions if output_attentions is not None else self.generation_config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
)
return_dict_in_generate = (
return_dict_in_generate
if return_dict_in_generate is not None
else self.generation_config.return_dict_in_generate
)
# init attention / hidden states / scores tuples # init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and output_scores) else None scores = () if (return_dict_in_generate and output_scores) else None
...@@ -4726,7 +3704,7 @@ class GenerationMixin: ...@@ -4726,7 +3704,7 @@ class GenerationMixin:
if len(logits_processor) > 0: if len(logits_processor) > 0:
for i in range(candidate_length + 1): for i in range(candidate_length + 1):
new_logits[:, i, :] = logits_processor(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :]) new_logits[:, i, :] = logits_processor(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :])
if len(logits_warper) > 0: if do_sample and len(logits_warper) > 0:
for i in range(candidate_length + 1): for i in range(candidate_length + 1):
new_logits[:, i, :] = logits_warper(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :]) new_logits[:, i, :] = logits_warper(candidate_input_ids[:, : cur_len + i], new_logits[:, i, :])
......
...@@ -1650,8 +1650,6 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel): ...@@ -1650,8 +1650,6 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel):
batch_size = input_ids.shape[0] // self.num_codebooks batch_size = input_ids.shape[0] // self.num_codebooks
# 4. Define other model kwargs # 4. Define other model kwargs
model_kwargs["output_attentions"] = generation_config.output_attentions
model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
model_kwargs["use_cache"] = generation_config.use_cache model_kwargs["use_cache"] = generation_config.use_cache
model_kwargs["guidance_scale"] = generation_config.guidance_scale model_kwargs["guidance_scale"] = generation_config.guidance_scale
...@@ -1748,10 +1746,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel): ...@@ -1748,10 +1746,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel):
input_ids, input_ids,
logits_processor=logits_processor, logits_processor=logits_processor,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
...@@ -1774,10 +1769,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel): ...@@ -1774,10 +1769,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel):
logits_processor=logits_processor, logits_processor=logits_processor,
logits_warper=logits_warper, logits_warper=logits_warper,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
...@@ -2423,8 +2415,8 @@ class MusicgenForConditionalGeneration(PreTrainedModel): ...@@ -2423,8 +2415,8 @@ class MusicgenForConditionalGeneration(PreTrainedModel):
self, self,
inputs_tensor: torch.Tensor, inputs_tensor: torch.Tensor,
model_kwargs, model_kwargs,
model_input_name: Optional[str] = None, model_input_name: Optional[str],
guidance_scale: Optional[float] = None, generation_config: GenerationConfig,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
# 1. get text encoder # 1. get text encoder
encoder = self.get_text_encoder() encoder = self.get_text_encoder()
...@@ -2446,6 +2438,9 @@ class MusicgenForConditionalGeneration(PreTrainedModel): ...@@ -2446,6 +2438,9 @@ class MusicgenForConditionalGeneration(PreTrainedModel):
encoder_kwargs = { encoder_kwargs = {
argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
} }
encoder_kwargs["output_attentions"] = generation_config.output_attentions
encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states
guidance_scale = generation_config.guidance_scale
# 3. make sure that encoder returns `ModelOutput` # 3. make sure that encoder returns `ModelOutput`
model_input_name = model_input_name if model_input_name is not None else self.text_encoder.main_input_name model_input_name = model_input_name if model_input_name is not None else self.text_encoder.main_input_name
...@@ -2708,8 +2703,6 @@ class MusicgenForConditionalGeneration(PreTrainedModel): ...@@ -2708,8 +2703,6 @@ class MusicgenForConditionalGeneration(PreTrainedModel):
batch_size = inputs_tensor.shape[0] batch_size = inputs_tensor.shape[0]
# 4. Define other model kwargs # 4. Define other model kwargs
model_kwargs["output_attentions"] = generation_config.output_attentions
model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
model_kwargs["use_cache"] = generation_config.use_cache model_kwargs["use_cache"] = generation_config.use_cache
model_kwargs["guidance_scale"] = generation_config.guidance_scale model_kwargs["guidance_scale"] = generation_config.guidance_scale
...@@ -2723,10 +2716,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel): ...@@ -2723,10 +2716,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel):
if "encoder_outputs" not in model_kwargs: if "encoder_outputs" not in model_kwargs:
# encoder_outputs are created and added to `model_kwargs` # encoder_outputs are created and added to `model_kwargs`
model_kwargs = self._prepare_text_encoder_kwargs_for_generation( model_kwargs = self._prepare_text_encoder_kwargs_for_generation(
inputs_tensor, inputs_tensor, model_kwargs, model_input_name, generation_config
model_kwargs,
model_input_name,
guidance_scale=generation_config.guidance_scale,
) )
if "decoder_input_ids" not in model_kwargs and "input_values" in model_kwargs: if "decoder_input_ids" not in model_kwargs and "input_values" in model_kwargs:
...@@ -2831,10 +2821,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel): ...@@ -2831,10 +2821,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel):
input_ids, input_ids,
logits_processor=logits_processor, logits_processor=logits_processor,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
...@@ -2858,10 +2845,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel): ...@@ -2858,10 +2845,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel):
logits_processor=logits_processor, logits_processor=logits_processor,
logits_warper=logits_warper, logits_warper=logits_warper,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
......
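None of these MusicGen changes alter the public surface: per-call options such as `output_scores`, `return_dict_in_generate`, and `guidance_scale` still reach the decoding loops, only now bundled inside the `GenerationConfig` that `generate()` threads through. A minimal caller-side sketch (checkpoint name and parameter values are illustrative):

```python
from transformers import AutoProcessor, GenerationConfig, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["80s pop with a funky bass line"], padding=True, return_tensors="pt")

# Options the old call sites unpacked one by one (pad/eos token ids, output_scores,
# return_dict_in_generate, guidance_scale) all travel inside one GenerationConfig.
generation_config = GenerationConfig(
    do_sample=True,
    max_new_tokens=256,
    guidance_scale=3.0,
    output_scores=True,
    return_dict_in_generate=True,
)

outputs = model.generate(**inputs, generation_config=generation_config)
print(type(outputs))
```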
...@@ -1586,8 +1586,6 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel): ...@@ -1586,8 +1586,6 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel):
batch_size = input_ids.shape[0] // self.num_codebooks batch_size = input_ids.shape[0] // self.num_codebooks
# 4. Define other model kwargs # 4. Define other model kwargs
model_kwargs["output_attentions"] = generation_config.output_attentions
model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
model_kwargs["use_cache"] = generation_config.use_cache model_kwargs["use_cache"] = generation_config.use_cache
model_kwargs["guidance_scale"] = generation_config.guidance_scale model_kwargs["guidance_scale"] = generation_config.guidance_scale
...@@ -1684,10 +1682,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel): ...@@ -1684,10 +1682,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel):
input_ids, input_ids,
logits_processor=logits_processor, logits_processor=logits_processor,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
...@@ -1710,10 +1705,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel): ...@@ -1710,10 +1705,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel):
logits_processor=logits_processor, logits_processor=logits_processor,
logits_warper=logits_warper, logits_warper=logits_warper,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
...@@ -2318,12 +2310,13 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel): ...@@ -2318,12 +2310,13 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
self, self,
inputs_tensor: torch.Tensor, inputs_tensor: torch.Tensor,
model_kwargs, model_kwargs,
model_input_name: Optional[str] = None, model_input_name: Optional[str],
guidance_scale: Optional[float] = None, generation_config: GenerationConfig,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
encoder_hidden_states = None encoder_hidden_states = None
# attention mask is consumed once to produce text conditional hidden states through the text encoder # attention mask is consumed once to produce text conditional hidden states through the text encoder
encoder_attention_mask = model_kwargs.pop("attention_mask") encoder_attention_mask = model_kwargs.pop("attention_mask")
guidance_scale = generation_config.guidance_scale
# 1. condition on text # 1. condition on text
if inputs_tensor is not None: if inputs_tensor is not None:
...@@ -2346,6 +2339,8 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel): ...@@ -2346,6 +2339,8 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
encoder_kwargs = { encoder_kwargs = {
argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
} }
encoder_kwargs["output_attentions"] = generation_config.output_attentions
encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states
# make sure that encoder returns `ModelOutput` # make sure that encoder returns `ModelOutput`
model_input_name = model_input_name if model_input_name is not None else self.text_encoder.main_input_name model_input_name = model_input_name if model_input_name is not None else self.text_encoder.main_input_name
...@@ -2572,8 +2567,6 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel): ...@@ -2572,8 +2567,6 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
batch_size = inputs_tensor.shape[0] batch_size = inputs_tensor.shape[0]
# 4. Define other model kwargs # 4. Define other model kwargs
model_kwargs["output_attentions"] = generation_config.output_attentions
model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
model_kwargs["use_cache"] = generation_config.use_cache model_kwargs["use_cache"] = generation_config.use_cache
model_kwargs["guidance_scale"] = generation_config.guidance_scale model_kwargs["guidance_scale"] = generation_config.guidance_scale
...@@ -2585,10 +2578,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel): ...@@ -2585,10 +2578,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
if "encoder_hidden_states" not in model_kwargs: if "encoder_hidden_states" not in model_kwargs:
# encoder_hidden_states are created and added to `model_kwargs` # encoder_hidden_states are created and added to `model_kwargs`
model_kwargs = self._prepare_encoder_hidden_states_kwargs_for_generation( model_kwargs = self._prepare_encoder_hidden_states_kwargs_for_generation(
inputs_tensor, inputs_tensor, model_kwargs, model_input_name, generation_config
model_kwargs,
model_input_name,
guidance_scale=generation_config.guidance_scale,
) )
# 5. Prepare `input_ids` which will be used for auto-regressive generation # 5. Prepare `input_ids` which will be used for auto-regressive generation
...@@ -2684,14 +2674,11 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel): ...@@ -2684,14 +2674,11 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
) )
# 11. run greedy search # 11. run greedy search
outputs = self.greedy_search( outputs = self._greedy_search(
input_ids, input_ids,
logits_processor=logits_processor, logits_processor=logits_processor,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
...@@ -2710,15 +2697,12 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel): ...@@ -2710,15 +2697,12 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
) )
# 12. run sample # 12. run sample
outputs = self.sample( outputs = self._sample(
input_ids, input_ids,
logits_processor=logits_processor, logits_processor=logits_processor,
logits_warper=logits_warper, logits_warper=logits_warper,
stopping_criteria=stopping_criteria, stopping_criteria=stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id,
output_scores=generation_config.output_scores,
return_dict_in_generate=generation_config.return_dict_in_generate,
synced_gpus=synced_gpus, synced_gpus=synced_gpus,
streamer=streamer, streamer=streamer,
**model_kwargs, **model_kwargs,
......
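Alongside the kwarg consolidation, the call sites above switch from `self.greedy_search` / `self.sample` to `self._greedy_search` / `self._sample`: the per-strategy entry points are no longer public, and external code is expected to go through `generate()` and let it dispatch. A minimal sketch of the caller-side equivalent (checkpoint is illustrative):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The meaning of life is", return_tensors="pt")

# num_beams=1 with do_sample=False dispatches to the (now private) greedy search;
# do_sample=True dispatches to the (now private) multinomial sampling loop.
greedy_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=20)
sampled_ids = model.generate(**inputs, do_sample=True, top_k=50, max_new_tokens=20)

print(tokenizer.batch_decode(greedy_ids, skip_special_tokens=True))
print(tokenizer.batch_decode(sampled_ids, skip_special_tokens=True))
```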
...@@ -1537,6 +1537,10 @@ class RagTokenForGeneration(RagPreTrainedModel): ...@@ -1537,6 +1537,10 @@ class RagTokenForGeneration(RagPreTrainedModel):
logits_processor=logits_processor, logits_processor=logits_processor,
) )
prepared_stopping_criteria = self._get_stopping_criteria(
generation_config=generation_config, stopping_criteria=stopping_criteria
)
if generation_config.num_beams == 1: if generation_config.num_beams == 1:
if generation_config.num_return_sequences > 1: if generation_config.num_return_sequences > 1:
raise ValueError( raise ValueError(
...@@ -1546,9 +1550,10 @@ class RagTokenForGeneration(RagPreTrainedModel): ...@@ -1546,9 +1550,10 @@ class RagTokenForGeneration(RagPreTrainedModel):
return self._greedy_search( return self._greedy_search(
input_ids, input_ids,
logits_processor=pre_processor, logits_processor=pre_processor,
max_length=generation_config.max_length, stopping_criteria=prepared_stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id, synced_gpus=False,
streamer=None,
**model_kwargs, **model_kwargs,
) )
elif generation_config.num_beams > 1: elif generation_config.num_beams > 1:
...@@ -1567,9 +1572,9 @@ class RagTokenForGeneration(RagPreTrainedModel): ...@@ -1567,9 +1572,9 @@ class RagTokenForGeneration(RagPreTrainedModel):
input_ids, input_ids,
beam_scorer, beam_scorer,
logits_processor=pre_processor, logits_processor=pre_processor,
max_length=generation_config.max_length, stopping_criteria=prepared_stopping_criteria,
pad_token_id=generation_config.pad_token_id, generation_config=generation_config,
eos_token_id=generation_config.eos_token_id, synced_gpus=False,
**model_kwargs, **model_kwargs,
) )
else: else:
......
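For RAG the same consolidation applies, with one extra step: `max_length` no longer rides along as a bare kwarg, so the hunk builds a stopping-criteria list from the `GenerationConfig` up front and hands it to the search together with the config. A rough approximation of what that preparation amounts to, assembled by hand from the public criteria classes (`_get_stopping_criteria` itself is private, so this is an assumption about its effect, not a call into it):

```python
from transformers import GenerationConfig
from transformers.generation import MaxLengthCriteria, StoppingCriteriaList

generation_config = GenerationConfig(max_length=64)

# The max_length that used to be forwarded directly to _greedy_search/_beam_search
# is expressed as a stopping criterion derived from the config instead.
stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=generation_config.max_length)])

print(stopping_criteria[0].max_length)
```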