Unverified commit b5e2b183, authored by Sylvain Gugger, committed by GitHub
Browse files

Doc styler examples (#14953)

* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
parent e13f72fb
......@@ -50,11 +50,12 @@ Here is an example of model usage:
```python
>>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
>>> mname = 'facebook/blenderbot-400M-distill'
>>> mname = "facebook/blenderbot-400M-distill"
>>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
>>> reply_ids = model.generate(**inputs)
>>> print(tokenizer.batch_decode(reply_ids))
["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]
......
......@@ -51,12 +51,14 @@ ByT5 works on raw UTF-8 bytes, so it can be used without a tokenizer:
from transformers import T5ForConditionalGeneration
import torch
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
input_ids = torch.tensor([list("Life is like a box of chocolates.".encode("utf-8"))]) + 3 # add 3 for special tokens
labels = torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3 # add 3 for special tokens
labels = (
torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3
) # add 3 for special tokens
loss = model(input_ids, labels=labels).loss # forward pass
loss = model(input_ids, labels=labels).loss # forward pass
```
For batched inference and training it is however recommended to make use of the tokenizer:
......@@ -64,13 +66,17 @@ For batched inference and training it is however recommended to make use of the
```python
from transformers import T5ForConditionalGeneration, AutoTokenizer
model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
model_inputs = tokenizer(["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt")
labels = tokenizer(["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt").input_ids
model_inputs = tokenizer(
["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt"
)
labels = tokenizer(
["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt"
).input_ids
loss = model(**model_inputs, labels=labels).loss # forward pass
loss = model(**model_inputs, labels=labels).loss # forward pass
```
## ByT5Tokenizer
......
......@@ -64,13 +64,13 @@ CANINE works on raw characters, so it can be used without a tokenizer:
>>> from transformers import CanineModel
>>> import torch
>>> model = CanineModel.from_pretrained('google/canine-c') # model pre-trained with autoregressive character loss
>>> model = CanineModel.from_pretrained("google/canine-c") # model pre-trained with autoregressive character loss
>>> text = "hello world"
>>> # use Python's built-in ord() function to turn each character into its unicode code point id
>>> input_ids = torch.tensor([[ord(char) for char in text]])
>>> outputs = model(input_ids) # forward pass
>>> outputs = model(input_ids) # forward pass
>>> pooled_output = outputs.pooler_output
>>> sequence_output = outputs.last_hidden_state
```
......@@ -81,13 +81,13 @@ sequences to the same length):
```python
>>> from transformers import CanineTokenizer, CanineModel
>>> model = CanineModel.from_pretrained('google/canine-c')
>>> tokenizer = CanineTokenizer.from_pretrained('google/canine-c')
>>> model = CanineModel.from_pretrained("google/canine-c")
>>> tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
>>> inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
>>> encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")
>>> outputs = model(**encoding) # forward pass
>>> outputs = model(**encoding) # forward pass
>>> pooled_output = outputs.pooler_output
>>> sequence_output = outputs.last_hidden_state
```
......
......@@ -69,8 +69,8 @@ encode the text and prepare the images. The following example shows how to get t
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
```
This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/openai/CLIP).
......
......@@ -29,16 +29,24 @@ The `generate()` method can be used to generate text using GPT Neo model.
```python
>>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
>>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
>>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
>>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
... "researchers was the fact that the unicorns spoke perfect English."
>>> prompt = (
... "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
... "researchers was the fact that the unicorns spoke perfect English."
... )
>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
>>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
>>> gen_tokens = model.generate(
... input_ids,
... do_sample=True,
... temperature=0.9,
... max_length=100,
... )
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
```
......
......@@ -33,7 +33,9 @@ Tips:
>>> from transformers import GPTJForCausalLM
>>> import torch
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
>>> model = GPTJForCausalLM.from_pretrained(
... "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True
... )
```
- The model should fit on 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
......@@ -56,16 +58,24 @@ model.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
>>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
>>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
... "researchers was the fact that the unicorns spoke perfect English."
>>> prompt = (
... "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
... "researchers was the fact that the unicorns spoke perfect English."
... )
>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
>>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
>>> gen_tokens = model.generate(
... input_ids,
... do_sample=True,
... temperature=0.9,
... max_length=100,
... )
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
```
......@@ -78,13 +88,20 @@ model.
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
>>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
>>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
... "researchers was the fact that the unicorns spoke perfect English."
>>> prompt = (
... "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
... "researchers was the fact that the unicorns spoke perfect English."
... )
>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
>>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
>>> gen_tokens = model.generate(
... input_ids,
... do_sample=True,
... temperature=0.9,
... max_length=100,
... )
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
```
......
......@@ -41,7 +41,7 @@ Examples of use:
>>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
>>> model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
>>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
>>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors="pt")
>>> outputs = model(encoded_input)
>>> # HerBERT can also be loaded using AutoTokenizer and AutoModel:
......
......@@ -53,12 +53,12 @@ Tips:
```python
def normalize_bbox(bbox, width, height):
return [
int(1000 * (bbox[0] / width)),
int(1000 * (bbox[1] / height)),
int(1000 * (bbox[2] / width)),
int(1000 * (bbox[3] / height)),
]
return [
int(1000 * (bbox[0] / width)),
int(1000 * (bbox[1] / height)),
int(1000 * (bbox[2] / width)),
int(1000 * (bbox[3] / height)),
]
```
Here, `width` and `height` correspond to the width and height of the original document in which the token
......
......@@ -70,12 +70,12 @@ Tips:
```python
def normalize_bbox(bbox, width, height):
return [
int(1000 * (bbox[0] / width)),
int(1000 * (bbox[1] / height)),
int(1000 * (bbox[2] / width)),
int(1000 * (bbox[3] / height)),
]
return [
int(1000 * (bbox[0] / width)),
int(1000 * (bbox[1] / height)),
int(1000 * (bbox[2] / width)),
int(1000 * (bbox[3] / height)),
]
```
Here, `width` and `height` correspond to the width and height of the original document in which the token
......@@ -123,7 +123,7 @@ modality.
```python
from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor
feature_extractor = LayoutLMv2FeatureExtractor() # apply_ocr is set to True by default
feature_extractor = LayoutLMv2FeatureExtractor() # apply_ocr is set to True by default
tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
processor = LayoutLMv2Processor(feature_extractor, tokenizer)
```
......@@ -158,7 +158,9 @@ from PIL import Image
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
encoding = processor(image, return_tensors="pt") # you can also add all tokenizer parameters here such as padding, truncation
encoding = processor(
image, return_tensors="pt"
) # you can also add all tokenizer parameters here such as padding, truncation
print(encoding.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
```
......@@ -177,7 +179,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
encoding = processor(image, words, boxes=boxes, return_tensors="pt")
print(encoding.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
......@@ -199,7 +201,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
word_labels = [1, 2]
encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
print(encoding.keys())
......@@ -219,7 +221,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
question = "What's his name?"
encoding = processor(image, question, return_tensors="pt")
encoding = processor(image, question, return_tensors="pt")
print(encoding.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
```
......@@ -238,8 +240,8 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
question = "What's his name?"
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
print(encoding.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
```
......
......@@ -34,7 +34,7 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
```python
from transformers import LayoutLMv2Model
model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")
```
Note that LayoutXLM has its own tokenizer, based on
......@@ -44,7 +44,7 @@ follows:
```python
from transformers import LayoutXLMTokenizer
tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
```
Similar to LayoutLMv2, you can use [`LayoutXLMProcessor`] (which internally applies
......
......@@ -75,8 +75,8 @@ For more information, please refer to the official [paper](https://arxiv.org/pdf
trained and should be used as follows:
```python
input_ids = tokenizer.encode('This is a sentence from [MASK] training data', return_tensors='pt')
mlm_labels = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
```
......
......@@ -84,24 +84,27 @@ Example:
>>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
>>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
# Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé"
>>> text = "Beyoncé lives in Los Angeles."
>>> entity_spans = [(0, 7)] # character-based entity span corresponding to "Beyoncé"
>>> inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
>>> outputs = model(**inputs)
>>> word_last_hidden_state = outputs.last_hidden_state
>>> entity_last_hidden_state = outputs.entity_last_hidden_state
# Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations
>>> entities = ["Beyoncé", "Los Angeles"] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
>>> entities = [
... "Beyoncé",
... "Los Angeles",
... ] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
>>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
>>> inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
>>> outputs = model(**inputs)
>>> word_last_hidden_state = outputs.last_hidden_state
>>> entity_last_hidden_state = outputs.entity_last_hidden_state
# Example 3: Classifying the relationship between two entities using LukeForEntityPairClassification head model
>>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
>>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
>>> entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
......
......@@ -49,8 +49,8 @@ examples. To install `sentencepiece` run `pip install sentencepiece`.
```python
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")
src_text = "Life is like a box of chocolates."
tgt_text = "La vie est comme une boîte de chocolat."
......@@ -59,7 +59,7 @@ model_inputs = tokenizer(src_text, return_tensors="pt")
with tokenizer.as_target_tokenizer():
labels = tokenizer(tgt_text, return_tensors="pt").input_ids
loss = model(**model_inputs, labels=labels) # forward pass
loss = model(**model_inputs, labels=labels) # forward pass
```
- Generation
......
......@@ -65,13 +65,14 @@ require 3 character language codes:
```python
>>> from transformers import MarianMTModel, MarianTokenizer
>>> src_text = [
... '>>fra<< this is a sentence in english that we want to translate to french',
... '>>por<< This should go to portuguese',
... '>>esp<< And this to Spanish'
>>> ]
... ">>fra<< this is a sentence in english that we want to translate to french",
... ">>por<< This should go to portuguese",
... ">>esp<< And this to Spanish",
... ]
>>> model_name = 'Helsinki-NLP/opus-mt-en-roa'
>>> model_name = "Helsinki-NLP/opus-mt-en-roa"
>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
>>> print(tokenizer.supported_language_codes)
['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
......@@ -88,11 +89,12 @@ Here is the code to see all available pretrained models on the hub:
```python
from huggingface_hub import list_models
model_list = list_models()
org = "Helsinki-NLP"
model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
suffix = [x.split('/')[1] for x in model_ids]
old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
suffix = [x.split("/")[1] for x in model_ids]
old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
```
## Old Style Multi-Lingual Models
......@@ -100,7 +102,7 @@ old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
These are the old-style multilingual models ported from the OPUS-MT-Train repo, together with the members of each
language group:
```python
```python no-style
['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
'Helsinki-NLP/opus-mt-ROMANCE-en',
'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
......@@ -129,13 +131,14 @@ Example of translating english to many romance languages, using old-style 2 char
```python
>>> from transformers import MarianMTModel, MarianTokenizer
>>> src_text = [
... '>>fr<< this is a sentence in english that we want to translate to french',
... '>>pt<< This should go to portuguese',
... '>>es<< And this to Spanish'
>>> ]
... ">>fr<< this is a sentence in english that we want to translate to french",
... ">>pt<< This should go to portuguese",
... ">>es<< And this to Spanish",
... ]
>>> model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
>>> model = MarianMTModel.from_pretrained(model_name)
......
......@@ -52,7 +52,7 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> # forward pass
>>> model(**inputs, labels=batch['labels'])
>>> model(**inputs, labels=batch["labels"])
```
- Generation
......@@ -106,13 +106,13 @@ model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
src_text = " UN Chief Says There Is No Military Solution in Syria"
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
model_inputs = tokenizer(src_text, return_tensors="pt")
with tokenizer.as_target_tokenizer():
labels = tokenizer(tgt_text, return_tensors="pt").input_ids
model(**model_inputs, labels=labels) # forward pass
model(**model_inputs, labels=labels) # forward pass
```
- Generation
......
......@@ -38,7 +38,7 @@ One can directly plug in the weights of mLUKE into a LUKE model, like so:
```python
from transformers import LukeModel
model = LukeModel.from_pretrained('studio-ousia/mluke-base')
model = LukeModel.from_pretrained("studio-ousia/mluke-base")
```
Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it as follows:
......@@ -46,7 +46,7 @@ Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it
```python
from transformers import MLukeTokenizer
tokenizer = MLukeTokenizer.from_pretrained('studio-ousia/mluke-base')
tokenizer = MLukeTokenizer.from_pretrained("studio-ousia/mluke-base")
```
As mLUKE's architecture is equivalent to that of LUKE, one can refer to [LUKE's documentation page](luke) for all
......
......@@ -69,18 +69,22 @@ All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tun
```python
>>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
>>> import torch
>>> src_text = [
... """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
>>> ]
>>> model_name = 'google/pegasus-xsum'
>>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
>>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
>>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
>>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
>>> translated = model.generate(**batch)
>>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
>>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
... ]
>>> model_name = "google/pegasus-xsum"
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
>>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
>>> batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)
>>> translated = model.generate(**batch)
>>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
>>> assert (
... tgt_text[0]
... == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
... )
```
## PegasusConfig
......
......@@ -75,9 +75,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp
```python
>>> # Find the TensorQuantizer and enable calibration
>>> for name, module in model.named_modules():
>>> if name.endswith('_input_quantizer'):
>>> module.enable_calib()
>>> module.disable_quant() # Use full precision data to calibrate
... if name.endswith("_input_quantizer"):
... module.enable_calib()
... module.disable_quant() # Use full precision data to calibrate
>>> # Feeding data samples
>>> model(x)
......@@ -85,9 +85,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp
>>> # Finalize calibration
>>> for name, module in model.named_modules():
>>> if name.endswith('_input_quantizer'):
>>> module.load_calib_amax()
>>> module.enable_quant()
... if name.endswith("_input_quantizer"):
... module.load_calib_amax()
... module.enable_quant()
>>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process
>>> model.cuda()
......@@ -105,6 +105,7 @@ the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Exa
```python
>>> from pytorch_quantization.nn import TensorQuantizer
>>> TensorQuantizer.use_fb_fake_quant = True
>>> # Load the calibrated model
......
......@@ -134,7 +134,7 @@ easily be trained on sequences as long as 64000 tokens.
For training, the [`ReformerModelWithLMHead`] should be used as follows:
```python
input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
loss = model(input_ids, labels=input_ids)[0]
```
......
......@@ -52,11 +52,13 @@ be installed as follows: `apt install libsndfile1-dev`
>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
......@@ -83,16 +85,22 @@ be installed as follows: `apt install libsndfile1-dev`
>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
>>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
>>> generated_ids = model.generate(
... input_ids=inputs["input_features"],
... attention_mask=inputs["attention_mask"],
... forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"],
... )
>>> translation = processor.batch_decode(generated_ids)
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment