"docs/vscode:/vscode.git/clone" did not exist on "dc9147ff362e4e69829f64d28178c77cab4bef6f"
Unverified commit b5e2b183 authored by Sylvain Gugger, committed by GitHub

Doc styler examples (#14953)

* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
Parent: e13f72fb
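The bulk of the diff below is mechanical: fenced Python examples in docstrings and `.mdx` doc sources are now run through black, and blocks tagged `no-style` are left untouched. As a rough illustration of that idea only, here is a minimal sketch (it assumes black's Python API and a 119-character line length; it is not the repository's actual `style_doc` implementation):

```python
# Minimal, hypothetical sketch: reformat every ```python block in an .mdx file with black.
# Blocks opened as ```python no-style never match the pattern, so they are left as-is.
import re

import black


def blackify_code_blocks(mdx_text: str, line_length: int = 119) -> str:
    """Return `mdx_text` with each plain ```python block reformatted by black."""
    pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
    mode = black.Mode(line_length=line_length)

    def _reformat(match):
        code = match.group(1)
        try:
            code = black.format_str(code, mode=mode)
        except Exception:
            # Doctest-style examples (>>> / ...) are not valid Python as written;
            # a real styler would strip the prompts first, this sketch simply skips them.
            pass
        return f"```python\n{code}```"

    return pattern.sub(_reformat, mdx_text)
```

Applied to a doc file, this reproduces the kind of quote and line-wrapping changes visible in the hunks that follow.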
@@ -50,11 +50,12 @@ Here is an example of model usage:
 ```python
 >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
->>> mname = 'facebook/blenderbot-400M-distill'
+>>> mname = "facebook/blenderbot-400M-distill"
 >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
 >>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
 >>> UTTERANCE = "My friends are cool but they eat too many carbs."
->>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
+>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
 >>> reply_ids = model.generate(**inputs)
 >>> print(tokenizer.batch_decode(reply_ids))
 ["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]
@@ -51,10 +51,12 @@ ByT5 works on raw UTF-8 bytes, so it can be used without a tokenizer:
 from transformers import T5ForConditionalGeneration
 import torch
-model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
 input_ids = torch.tensor([list("Life is like a box of chocolates.".encode("utf-8"))]) + 3 # add 3 for special tokens
-labels = torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3 # add 3 for special tokens
+labels = (
+    torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3
+) # add 3 for special tokens
 loss = model(input_ids, labels=labels).loss # forward pass
 ```
@@ -64,11 +66,15 @@ For batched inference and training it is however recommended to make use of the
 ```python
 from transformers import T5ForConditionalGeneration, AutoTokenizer
-model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
-tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
-model_inputs = tokenizer(["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt")
-labels = tokenizer(["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt").input_ids
+model_inputs = tokenizer(
+    ["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt"
+)
+labels = tokenizer(
+    ["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt"
+).input_ids
 loss = model(**model_inputs, labels=labels).loss # forward pass
 ```
@@ -64,7 +64,7 @@ CANINE works on raw characters, so it can be used without a tokenizer:
 >>> from transformers import CanineModel
 >>> import torch
->>> model = CanineModel.from_pretrained('google/canine-c') # model pre-trained with autoregressive character loss
+>>> model = CanineModel.from_pretrained("google/canine-c") # model pre-trained with autoregressive character loss
 >>> text = "hello world"
 >>> # use Python's built-in ord() function to turn each character into its unicode code point id
@@ -81,8 +81,8 @@ sequences to the same length):
 ```python
 >>> from transformers import CanineTokenizer, CanineModel
->>> model = CanineModel.from_pretrained('google/canine-c')
->>> tokenizer = CanineTokenizer.from_pretrained('google/canine-c')
+>>> model = CanineModel.from_pretrained("google/canine-c")
+>>> tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
 >>> inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
 >>> encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")
@@ -29,16 +29,24 @@ The `generate()` method can be used to generate text using GPT Neo model.
 ```python
 >>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
 >>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
 >>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-... "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+... "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+... "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+... "researchers was the fact that the unicorns spoke perfect English."
+... )
 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+... input_ids,
+... do_sample=True,
+... temperature=0.9,
+... max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```
@@ -33,7 +33,9 @@ Tips:
 >>> from transformers import GPTJForCausalLM
 >>> import torch
->>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
+>>> model = GPTJForCausalLM.from_pretrained(
+... "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True
+... )
 ```
 - The model should fit on 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
@@ -56,16 +58,24 @@ model.
 ```python
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 >>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
 >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-... "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+... "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+... "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+... "researchers was the fact that the unicorns spoke perfect English."
+... )
 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+... input_ids,
+... do_sample=True,
+... temperature=0.9,
+... max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```
@@ -78,13 +88,20 @@ model.
 >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
 >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-... "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+... "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+... "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+... "researchers was the fact that the unicorns spoke perfect English."
+... )
 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+... input_ids,
+... do_sample=True,
+... temperature=0.9,
+... max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```
@@ -41,7 +41,7 @@ Examples of use:
 >>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
 >>> model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
->>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+>>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors="pt")
 >>> outputs = model(encoded_input)
 >>> # HerBERT can also be loaded using AutoTokenizer and AutoModel:
@@ -158,7 +158,9 @@ from PIL import Image
 processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
-encoding = processor(image, return_tensors="pt") # you can also add all tokenizer parameters here such as padding, truncation
+encoding = processor(
+    image, return_tensors="pt"
+) # you can also add all tokenizer parameters here such as padding, truncation
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```
@@ -34,7 +34,7 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
 ```python
 from transformers import LayoutLMv2Model
-model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
+model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")
 ```
 Note that LayoutXLM has its own tokenizer, based on
@@ -44,7 +44,7 @@ follows:
 ```python
 from transformers import LayoutXLMTokenizer
-tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
+tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
 ```
 Similar to LayoutLMv2, you can use [`LayoutXLMProcessor`] (which internally applies
@@ -75,8 +75,8 @@ For more information, please refer to the official [paper](https://arxiv.org/pdf
 trained and should be used as follows:
 ```python
-input_ids = tokenizer.encode('This is a sentence from [MASK] training data', return_tensors='pt')
-mlm_labels = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
+mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
 loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
 ```
@@ -84,24 +84,27 @@ Example:
 >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
 >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
 # Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé"
 >>> text = "Beyoncé lives in Los Angeles."
 >>> entity_spans = [(0, 7)] # character-based entity span corresponding to "Beyoncé"
 >>> inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> word_last_hidden_state = outputs.last_hidden_state
 >>> entity_last_hidden_state = outputs.entity_last_hidden_state
 # Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations
->>> entities = ["Beyoncé", "Los Angeles"] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
+>>> entities = [
+... "Beyoncé",
+... "Los Angeles",
+>>> ] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
 >>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
 >>> inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> word_last_hidden_state = outputs.last_hidden_state
 >>> entity_last_hidden_state = outputs.entity_last_hidden_state
 # Example 3: Classifying the relationship between two entities using LukeForEntityPairClassification head model
 >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
 >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
 >>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
@@ -49,8 +49,8 @@ examples. To install `sentencepiece` run `pip install sentencepiece`.
 ```python
 from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
-model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
-tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")
+model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")
 src_text = "Life is like a box of chocolates."
 tgt_text = "La vie est comme une boîte de chocolat."
@@ -65,13 +65,14 @@ require 3 character language codes:
 ```python
 >>> from transformers import MarianMTModel, MarianTokenizer
 >>> src_text = [
-... '>>fra<< this is a sentence in english that we want to translate to french',
-... '>>por<< This should go to portuguese',
-... '>>esp<< And this to Spanish'
->>> ]
->>> model_name = 'Helsinki-NLP/opus-mt-en-roa'
+... ">>fra<< this is a sentence in english that we want to translate to french",
+... ">>por<< This should go to portuguese",
+... ">>esp<< And this to Spanish",
+... ]
+>>> model_name = "Helsinki-NLP/opus-mt-en-roa"
 >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
 >>> print(tokenizer.supported_language_codes)
 ['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
@@ -88,11 +89,12 @@ Here is the code to see all available pretrained models on the hub:
 ```python
 from huggingface_hub import list_models
 model_list = list_models()
 org = "Helsinki-NLP"
 model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
-suffix = [x.split('/')[1] for x in model_ids]
-old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+suffix = [x.split("/")[1] for x in model_ids]
+old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
 ```
 ## Old Style Multi-Lingual Models
@@ -100,7 +102,7 @@ old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
 These are the old style multi-lingual models ported from the OPUS-MT-Train repo: and the members of each language
 group:
-```python
+```python no-style
 ['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
 'Helsinki-NLP/opus-mt-ROMANCE-en',
 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
@@ -129,13 +131,14 @@ Example of translating english to many romance languages, using old-style 2 char
 ```python
 >>> from transformers import MarianMTModel, MarianTokenizer
 >>> src_text = [
-... '>>fr<< this is a sentence in english that we want to translate to french',
-... '>>pt<< This should go to portuguese',
-... '>>es<< And this to Spanish'
->>> ]
->>> model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+... ">>fr<< this is a sentence in english that we want to translate to french",
+... ">>pt<< This should go to portuguese",
+... ">>es<< And this to Spanish",
+... ]
+>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
 >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
 >>> model = MarianMTModel.from_pretrained(model_name)
@@ -52,7 +52,7 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar
 >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
 >>> # forward pass
->>> model(**inputs, labels=batch['labels'])
+>>> model(**inputs, labels=batch["labels"])
 ```
 - Generation
@@ -38,7 +38,7 @@ One can directly plug in the weights of mLUKE into a LUKE model, like so:
 ```python
 from transformers import LukeModel
-model = LukeModel.from_pretrained('studio-ousia/mluke-base')
+model = LukeModel.from_pretrained("studio-ousia/mluke-base")
 ```
 Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it as follows:
@@ -46,7 +46,7 @@ Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it
 ```python
 from transformers import MLukeTokenizer
-tokenizer = MLukeTokenizer.from_pretrained('studio-ousia/mluke-base')
+tokenizer = MLukeTokenizer.from_pretrained("studio-ousia/mluke-base")
 ```
 As mLUKE's architecture is equivalent to that of LUKE, one can refer to [LUKE's documentation page](luke) for all
@@ -69,18 +69,22 @@ All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tun
 ```python
 >>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
 >>> import torch
 >>> src_text = [
 ... """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
->>> ]
->>> model_name = 'google/pegasus-xsum'
->>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
->>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
->>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
->>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
->>> translated = model.generate(**batch)
->>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
->>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+... ]
+... model_name = "google/pegasus-xsum"
+... device = "cuda" if torch.cuda.is_available() else "cpu"
+... tokenizer = PegasusTokenizer.from_pretrained(model_name)
+... model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
+... batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)
+... translated = model.generate(**batch)
+... tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+... assert (
+... tgt_text[0]
+... == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+... )
 ```
 ## PegasusConfig
@@ -75,9 +75,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp
 ```python
 >>> # Find the TensorQuantizer and enable calibration
 >>> for name, module in model.named_modules():
->>>     if name.endswith('_input_quantizer'):
->>>         module.enable_calib()
->>>         module.disable_quant() # Use full precision data to calibrate
+...     if name.endswith("_input_quantizer"):
+...         module.enable_calib()
+...         module.disable_quant() # Use full precision data to calibrate
 >>> # Feeding data samples
 >>> model(x)
@@ -85,9 +85,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp
 >>> # Finalize calibration
 >>> for name, module in model.named_modules():
->>>     if name.endswith('_input_quantizer'):
->>>         module.load_calib_amax()
->>>         module.enable_quant()
+...     if name.endswith("_input_quantizer"):
+...         module.load_calib_amax()
+...         module.enable_quant()
 >>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process
 >>> model.cuda()
@@ -105,6 +105,7 @@ the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Exa
 ```python
 >>> from pytorch_quantization.nn import TensorQuantizer
 >>> TensorQuantizer.use_fb_fake_quant = True
 >>> # Load the calibrated model
@@ -134,7 +134,7 @@ easily be trained on sequences as long as 64000 tokens.
 For training, the [`ReformerModelWithLMHead`] should be used as follows:
 ```python
-input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
 loss = model(input_ids, labels=input_ids)[0]
 ```
@@ -52,11 +52,13 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
@@ -83,16 +85,22 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
 >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
->>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
+>>> generated_ids = model.generate(
+... input_ids=inputs["input_features"],
+... attention_mask=inputs["attention_mask"],
+... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"],
+... )
 >>> translation = processor.batch_decode(generated_ids)
 ```