"...git@developer.sourcefind.cn:OpenDAS/pyg_autoscale.git" did not exist on "dcce414c2af0b48f2232806cdd6551c2652d21d8"
Unverified Commit 986526a0 authored by Sylvain Gugger, committed by GitHub

Replace `as_target` context managers by direct calls (#18325)



* Preliminary work on tokenizers

* Quality + fix tests

* Treat processors

* Fix pad

* Remove all uses of `as_target` context managers in tests, docs and examples

* Replace all as_target_tokenizer

* Fix tests

* Fix quality

* Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: amyeroberts <amy@huggingface.co>

* Style

Co-authored-by: amyeroberts <amy@huggingface.co>
parent a64bcb56
@@ -55,9 +55,7 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")
 src_text = "Life is like a box of chocolates."
 tgt_text = "La vie est comme une boîte de chocolat."

-model_inputs = tokenizer(src_text, return_tensors="pt")
-with tokenizer.as_target_tokenizer():
-    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")

-loss = model(**model_inputs, labels=labels)  # forward pass
+loss = model(**model_inputs).loss  # forward pass
 ```
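For readers skimming the diff, a minimal runnable sketch of the new one-call pattern; the checkpoint and sentences come from the hunk above, the rest is illustrative:

```python
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

# One call tokenizes the source and stores the tokenized target under "labels".
model_inputs = tokenizer(
    "Life is like a box of chocolates.",
    text_target="La vie est comme une boîte de chocolat.",
    return_tensors="pt",
)
loss = model(**model_inputs).loss  # forward pass
```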
@@ -155,7 +155,7 @@ Example of translating english to many romance languages, using old-style 2 character language codes

 ## MarianTokenizer

 [[autodoc]] MarianTokenizer
-    - as_target_tokenizer
+    - build_inputs_with_special_tokens

 ## MarianModel
@@ -34,8 +34,8 @@ model is multilingual it expects the sequences in a different format. A special language id token is added in both the
 source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The
 target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.

-The regular [`~MBartTokenizer.__call__`] will encode source text format, and it should be wrapped
-inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode target text format.
+The regular [`~MBartTokenizer.__call__`] will encode the source text format passed as first argument or with the `text`
+keyword, and the target text format passed with the `text_target` keyword argument.

 - Supervised training

@@ -46,13 +46,11 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode target text format.
 >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
 >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"

->>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
->>> with tokenizer.as_target_tokenizer():
-...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
+>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")

 >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")

 >>> # forward pass
->>> model(**inputs, labels=batch["labels"])
+>>> model(**inputs)
 ```

 - Generation
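As a hedged illustration of what the new single call returns (checkpoint and sentences from the hunk above; the `src_lang`/`tgt_lang` values are assumptions):

```python
from transformers import MBartForConditionalGeneration, MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")

inputs = tokenizer(
    "UN Chief Says There Is No Military Solution in Syria",
    text_target="Şeful ONU declară că nu există o soluţie militară în Siria",
    return_tensors="pt",
)
print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'labels']

loss = model(**inputs).loss  # labels are consumed straight from the encoding
```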
@@ -108,11 +106,9 @@ tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
 src_text = " UN Chief Says There Is No Military Solution in Syria"
 tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"

-model_inputs = tokenizer(src_text, return_tensors="pt")
-with tokenizer.as_target_tokenizer():
-    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")

-model(**model_inputs, labels=labels)  # forward pass
+model(**model_inputs)  # forward pass
 ```

 - Generation
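The `- Generation` section follows in the docs; a hedged sketch of that flow with the one-call tokenizer, assuming the many-to-many mBART-50 checkpoint:

```python
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

checkpoint = "facebook/mbart-large-50-many-to-many-mmt"  # assumed checkpoint for illustration
tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint, src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(checkpoint)

encoded = tokenizer("UN Chief Says There Is No Military Solution in Syria", return_tensors="pt")
generated_tokens = model.generate(
    **encoded,
    forced_bos_token_id=tokenizer.lang_code_to_id["ro_RO"],  # start generation in Romanian
)
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
```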
@@ -154,7 +150,6 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

 ## MBartTokenizer

 [[autodoc]] MBartTokenizer
-    - as_target_tokenizer
     - build_inputs_with_special_tokens

 ## MBartTokenizerFast

@@ -48,7 +48,6 @@ This model was contributed by [cwkeam](https://huggingface.co/cwkeam).
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor

 ## MCTCTModel

@@ -91,7 +91,6 @@ UN-Chef sagt, es gibt keine militärische Lösung in Syrien

 ## NllbTokenizer

 [[autodoc]] NllbTokenizer
-    - as_target_tokenizer
     - build_inputs_with_special_tokens

 ## NllbTokenizerFast
@@ -45,8 +45,9 @@ target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.

 However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this.

-In cases where the language code is needed, The regular [`~PLBartTokenizer.__call__`] will encode source text format, and it should be wrapped
-inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode target text format.
+In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format
+when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if
+it's passed with the `text_target` keyword argument.

 - Supervised training

@@ -56,11 +57,7 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode target text format.
 >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python")
 >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
 >>> expected_translation_english = "Returns the maximum value of a b c."
->>> inputs = tokenizer(example_python_phrase, return_tensors="pt")
->>> with tokenizer.as_target_tokenizer():
-...     labels = tokenizer(expected_translation_english, return_tensors="pt")
->>> inputs["labels"] = labels["input_ids"]
->>> # forward pass
+>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
 >>> model(**inputs)
 ```

@@ -88,7 +85,6 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode target text format.

 ## PLBartTokenizer

 [[autodoc]] PLBartTokenizer
-    - as_target_tokenizer
     - build_inputs_with_special_tokens

 ## PLBartModel
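To see the two formats side by side, a hedged inspection sketch; it relies only on the `uclanlp/plbart-base` tokenizer from the diff, and the sample strings are ours:

```python
from transformers import PLBartTokenizer

tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python")

encoding = tokenizer(
    "Returns the maximum value of a b c.",
    text_target="def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])",
)
# Source ids carry the source-language code; label ids follow the target format.
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
print(tokenizer.convert_ids_to_tokens(encoding["labels"]))
```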
@@ -107,7 +107,7 @@ speech inputs) and `labels` (which are the `input_ids` of the encoded target sequence).
 >>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids

 >>> # the forward function automatically creates the correct decoder_input_ids
->>> loss = model(input_values, labels=labels).loss
+>>> loss = model(**input_features).loss
 >>> loss.backward()
 ```
@@ -120,7 +120,6 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look for Speech2Text checkpoints.
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor

 ## Speech2TextModel

@@ -114,7 +114,6 @@ See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints.
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor

 ## Speech2Text2ForCausalLM

@@ -94,7 +94,6 @@ See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOCR checkpoints.
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor

 ## TrOCRForCausalLM

@@ -62,7 +62,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor

 ## Wav2Vec2ProcessorWithLM

@@ -73,7 +72,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor

 ## Wav2Vec2 specific outputs
@@ -486,10 +486,8 @@ A processor combines a feature extractor and tokenizer. Load a processor with [`AutoProcessor.from_pretrained`]:
 >>> def prepare_dataset(example):
 ...     audio = example["audio"]

-...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
-
-...     with processor.as_target_processor():
-...         example["labels"] = processor(example["text"]).input_ids
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))

 ...     return example
 ```
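A hedged, self-contained sketch of the combined call introduced above; the `facebook/wav2vec2-base-960h` checkpoint and the silent one-second waveform are stand-ins:

```python
import numpy as np
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

waveform = np.zeros(16000, dtype=np.float32)  # stand-in for a real 16 kHz clip
out = processor(audio=waveform, text="HELLO WORLD", sampling_rate=16000)

# One call now yields both the audio features and the tokenized transcript
# (an attention_mask may also appear, depending on the checkpoint).
print(list(out.keys()))  # expect 'input_values' and 'labels'
```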
@@ -109,11 +109,10 @@ The preprocessing function needs to:
 >>> def prepare_dataset(batch):
 ...     audio = batch["audio"]

-...     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
+...     batch["input_values"] = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
 ...     batch["input_length"] = len(batch["input_values"])
-...     with processor.as_target_processor():
-...         batch["labels"] = processor(batch["transcription"]).input_ids
+...     batch["labels"] = processor(text=batch["transcription"]).input_ids
 ...     return batch
 ```
@@ -146,17 +145,9 @@ Unlike other data collators, this specific data collator needs to apply a different padding method to `input_values` and `labels`:
 ...         input_features = [{"input_values": feature["input_values"]} for feature in features]
 ...         label_features = [{"input_ids": feature["labels"]} for feature in features]

-...         batch = self.processor.pad(
-...             input_features,
-...             padding=self.padding,
-...             return_tensors="pt",
-...         )
-...         with self.processor.as_target_processor():
-...             labels_batch = self.processor.pad(
-...                 label_features,
-...                 padding=self.padding,
-...                 return_tensors="pt",
-...             )
+...         batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
+
+...         labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

 ...         # replace padding with -100 to ignore loss correctly
 ...         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
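For context, a hedged sketch of the whole collator around the two `pad` calls above; the class name and fields follow the ASR task guide, so treat it as illustrative rather than the exact file:

```python
from dataclasses import dataclass
from typing import Dict, List, Union

import torch
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """Pads audio inputs and label ids to the longest item in the batch."""

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # One pad call per modality; the labels= keyword replaces as_target_processor.
        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
        labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

        # replace padding with -100 so padded positions are ignored by the loss
        batch["labels"] = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        return batch
```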
@@ -67,7 +67,7 @@ Load the T5 tokenizer to process `text` and `summary`:
 The preprocessing function needs to:

 1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
-2. Use a context manager with the `as_target_tokenizer()` function to parallelize tokenization of inputs and labels.
+2. Use the `text_target` keyword argument when tokenizing labels.
 3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.

 ```py

@@ -78,8 +78,7 @@ The preprocessing function needs to:
 ...     inputs = [prefix + doc for doc in examples["text"]]
 ...     model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

-...     with tokenizer.as_target_tokenizer():
-...         labels = tokenizer(examples["summary"], max_length=128, truncation=True)
+...     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

 ...     model_inputs["labels"] = labels["input_ids"]
 ...     return model_inputs
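Worth noting: the guide keeps two tokenizer calls because inputs and labels use different `max_length` values. When both sides share the same settings, a single call suffices; a hedged sketch with `t5-small` as an assumed checkpoint:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # assumed checkpoint

batch = tokenizer(
    ["summarize: first document", "summarize: second document"],
    text_target=["first summary", "second summary"],
    max_length=128,
    truncation=True,
)
print(sorted(batch.keys()))  # ['attention_mask', 'input_ids', 'labels']
```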
@@ -78,12 +78,7 @@ The preprocessing function needs to:
 >>> def preprocess_function(examples):
 ...     inputs = [prefix + example[source_lang] for example in examples["translation"]]
 ...     targets = [example[target_lang] for example in examples["translation"]]
-...     model_inputs = tokenizer(inputs, max_length=128, truncation=True)
-
-...     with tokenizer.as_target_tokenizer():
-...         labels = tokenizer(targets, max_length=128, truncation=True)
-
-...     model_inputs["labels"] = labels["input_ids"]
+...     model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
 ...     return model_inputs
 ```
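A hedged wiring sketch for the function above; the `opus_books` split, `t5-small` checkpoint and language pair are assumptions matching the translation guide's usual setup:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
source_lang, target_lang = "en", "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    # One call tokenizes inputs and attaches the tokenized targets as "labels".
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)


books = load_dataset("opus_books", "en-fr", split="train[:100]")
tokenized_books = books.map(preprocess_function, batched=True)
print(tokenized_books.column_names)
```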
@@ -471,10 +471,8 @@ Un processor combina un extractor de características y un tokenizador. Cargue un procesador con [`AutoProcessor.from_pretrained`]:
 >>> def prepare_dataset(example):
 ...     audio = example["audio"]

-...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
-
-...     with processor.as_target_processor():
-...         example["labels"] = processor(example["text"]).input_ids
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))

 ...     return example
 ```

@@ -471,10 +471,8 @@ Un processor combina un estrattore di caratteristiche e un tokenizer. Carica un processor con [`AutoProcessor.from_pretrained`]:
 >>> def prepare_dataset(example):
 ...     audio = example["audio"]

-...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
-
-...     with processor.as_target_processor():
-...         example["labels"] = processor(example["text"]).input_ids
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))

 ...     return example
 ```
@@ -552,10 +552,13 @@ def main():
         targets = captions
         model_inputs = {}

-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(
-                targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
-            )
+        labels = tokenizer(
+            text_target=targets,
+            max_length=max_target_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="np",
+        )

         model_inputs["labels"] = labels["input_ids"]

         decoder_input_ids = shift_tokens_right_fn(

@@ -590,9 +590,12 @@ def main():
         )

         # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(
-                targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
-            )
+        labels = tokenizer(
+            text_target=targets,
+            max_length=max_target_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="np",
+        )

         model_inputs["labels"] = labels["input_ids"]
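A hedged mini-check of the reshaped call above, using `t5-small` as a stand-in for the script's tokenizer:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # stand-in tokenizer

labels = tokenizer(
    text_target=["a cat sitting on a mat", "two dogs playing"],
    max_length=16,
    padding="max_length",
    truncation=True,
    return_tensors="np",
)
print(labels["input_ids"].shape)  # (2, 16), NumPy arrays as the Flax script expects
```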
@@ -453,9 +453,8 @@ def main():
         inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)

         model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True)
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
+        # Tokenize targets with text_target=...
+        labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)

         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.

@@ -479,9 +478,8 @@ def main():
             return_overflowing_tokens=True,
             return_offsets_mapping=True,
         )
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)

         # Since one example might give us several features if it has a long context, we need a map from a feature to
         # its corresponding example. This key gives us just that.
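The trailing comment refers to the `-100` replacement performed right after this hunk; a hedged, self-contained sketch of that step, with `t5-small` and the answer strings as stand-ins:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # stand-in checkpoint

labels = tokenizer(text_target=["Paris", "in the 1990s"], max_length=8, padding="max_length", truncation=True)

# Replace pad token ids by -100 so padded positions are ignored by the loss.
labels["input_ids"] = [
    [(token if token != tokenizer.pad_token_id else -100) for token in seq]
    for seq in labels["input_ids"]
]
print(labels["input_ids"])
```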