Unverified Commit 0dcb46e7 authored by Yih-Dar, committed by GitHub

Final update of doctest (#22299)



* update

* update

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 89a0a9ea
@@ -303,7 +303,7 @@ class AutoFeatureExtractor:
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
>>> # If feature extractor files are in a directory (e.g. feature extractor was saved using *save_pretrained('./test/saved_model/')*)
- >>> feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/")
+ >>> # feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/")
```"""
config = kwargs.pop("config", None)
trust_remote_code = kwargs.pop("trust_remote_code", False)
...
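In this and the three hunks that follow, the second `from_pretrained` call is turned into a comment so the doctest no longer depends on a local directory that does not exist when the example runs. For reference, a minimal sketch of the save/reload round trip that commented-out line alludes to (the `./test/saved_model/` path is just the docstring's placeholder):

```python
# Illustrative only: create the local directory first, then loading from it works as the
# original (now commented-out) doctest line suggested.
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor.save_pretrained("./test/saved_model/")  # writes preprocessor_config.json
feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/")
```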
@@ -306,7 +306,7 @@ class AutoImageProcessor:
>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> # If image processor files are in a directory (e.g. image processor was saved using *save_pretrained('./test/saved_model/')*)
- >>> image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/")
+ >>> # image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/")
```"""
config = kwargs.pop("config", None)
trust_remote_code = kwargs.pop("trust_remote_code", False)
...
@@ -188,7 +188,7 @@ class AutoProcessor:
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
>>> # If processor files are in a directory (e.g. processor was saved using *save_pretrained('./test/saved_model/')*)
- >>> processor = AutoProcessor.from_pretrained("./test/saved_model/")
+ >>> # processor = AutoProcessor.from_pretrained("./test/saved_model/")
```"""
config = kwargs.pop("config", None)
trust_remote_code = kwargs.pop("trust_remote_code", False)
...
@@ -575,7 +575,7 @@ class AutoTokenizer:
>>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
>>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
- >>> tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")
+ >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")
>>> # Download vocabulary from huggingface.co and define model-specific arguments
>>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
...
@@ -640,9 +640,17 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
- >>> from nltk.tokenize.casual import _replace_html_entities >>> _replace_html_entities(b'Price: &pound;100')
- 'Price: \\xa3100' >>> print(_replace_html_entities(b'Price: &pound;100')) Price: £100 >>>
- """
+ Examples:
+ ```python
+ >>> from nltk.tokenize.casual import _replace_html_entities
+ >>> _replace_html_entities(b"Price: &pound;100")
+ 'Price: \\xa3100'
+ >>> print(_replace_html_entities(b"Price: &pound;100"))
+ Price: £100
+ ```"""
def _convert_entity(match):
entity_body = match.group(3)
...
@@ -316,6 +316,7 @@ class CustomDPRReaderTokenizerMixin:
>>> outputs = model(**encoded_inputs)
>>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
>>> print(predicted_spans[0].text) # best span
+ a song
```"""
input_ids = reader_input["input_ids"]
start_logits, end_logits, relevance_logits = reader_output[:3]
...
@@ -316,6 +316,7 @@ class CustomDPRReaderTokenizerMixin:
>>> outputs = model(**encoded_inputs)
>>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
>>> print(predicted_spans[0].text) # best span
+ a song
```"""
input_ids = reader_input["input_ids"]
start_logits, end_logits, relevance_logits = reader_output[:3]
...
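Both DPR tokenizer files (slow and fast) gain the same expected output line, `a song`, so the doctest has a value to verify. For context, a sketch of the surrounding `decode_best_spans` example, assuming the usual DPR reader docstring setup with the facebook/dpr-reader-single-nq-base checkpoint:

```python
# Assumed context for the two hunks above; checkpoint and inputs follow the standard DPR reader example.
from transformers import DPRReader, DPRReaderTokenizer

tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")

encoded_inputs = tokenizer(
    questions=["What is love ?"],
    titles=["Haddaway"],
    texts=["'What Is Love' is a song recorded by the artist Haddaway"],
    return_tensors="pt",
)
outputs = model(**encoded_inputs)
predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
print(predicted_spans[0].text)  # best span -> a song
```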
@@ -96,7 +96,7 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
>>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> # You can confirm both 慶応 and 慶應 are encoded to 17750
>>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
- [34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
+ [35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
>>> # Both 慶応 and 慶應 are decoded to 慶応
>>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
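The corrected expected IDs simply include the two special tokens the tokenizer prepends (the SOT and SEG markers shown in the next hunk). A quick sketch, assuming the Tanrei/GPTSAN-japanese checkpoint from the docstring, of the normalisation the surrounding comments describe:

```python
# Both spellings 慶応 and 慶應 encode to the same ID and decode back to 慶応,
# which is what the doctest comments above assert.
from transformers import GPTSanJapaneseTokenizer

tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
ids = tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
print(ids.count(17750))       # 2: one occurrence per spelling
print(tokenizer.decode(ids))  # both are rendered as 慶応
```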
@@ -311,6 +311,9 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
Example:
```python
+ >>> from transformers import GPTSanJapaneseTokenizer
+ >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
>>> x_token = tokenizer("アイウエ")
>>> # input_ids: | SOT | SEG | ア | イ | ウ | エ |
>>> # token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 |
...
@@ -110,13 +110,14 @@ class M2M100Tokenizer(PreTrainedTokenizer):
Examples:
```python
- >>> from transformers import M2M100Tokenizer
+ >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+ >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
>>> src_text = " UN Chief Says There Is No Military Solution in Syria"
>>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
- >>> model(**model_inputs) # should work
+ >>> outputs = model(**model_inputs) # should work
```"""
vocab_files_names = VOCAB_FILES_NAMES
...
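Adding the model import and instantiation is what makes the final `outputs = model(**model_inputs)` line runnable. Not part of the diff, but as a follow-on, a hedged sketch of actually generating the Romanian translation with the same checkpoint (the `generate` / `get_lang_id` usage is an assumption here, not something this commit touches):

```python
# Optional continuation of the fixed example: translation via generate() with a forced
# Romanian BOS token.
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")

inputs = tokenizer(" UN Chief Says There Is No Military Solution in Syria", return_tensors="pt")
generated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("ro"))
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```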
@@ -106,13 +106,13 @@ class MarianTokenizer(PreTrainedTokenizer):
Examples:
```python
- >>> from transformers import MarianTokenizer
+ >>> from transformers import MarianForCausalLM, MarianTokenizer
+ >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
>>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)
- # keys [input_ids, attention_mask, labels].
>>> outputs = model(**inputs) # should work
```"""
...
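The removed inline comment listed the keys of the encoded batch; because a bare line inside a doctest block is read as expected output, it is dropped rather than kept. A small sketch (same checkpoint as the docstring) of what it described:

```python
# Passing text_target makes the tokenizer return labels alongside input_ids and attention_mask.
from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
inputs = tokenizer(
    ["I am a small frog.", "Tom asked his teacher for advice."],
    text_target=["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."],
    return_tensors="pt",
    padding=True,
)
print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'labels']
```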
@@ -344,7 +344,7 @@ class RoFormerTokenizer(PreTrainedTokenizer):
>>> tokenizer = RoFormerTokenizer.from_pretrained("junnyu/roformer_chinese_base")
>>> tokenizer.tokenize("今天天气非常好。")
- # ['今', '天', '天', '气', '非常', '好', '。']
+ ['今', '天', '天', '气', '非常', '好', '。']
```"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
...
@@ -85,7 +85,7 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
>>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
>>> tokenizer.tokenize("今天天气非常好。")
- # ['今', '天', '天', '气', '非常', '好', '。']
+ ['今', '天', '天', '气', '非常', '好', '。']
```"""
vocab_files_names = VOCAB_FILES_NAMES
...
@@ -88,7 +88,7 @@ def tokenize_numbers(text_array: List[str]) -> List[str]:
```python
>>> tokenize_numbers(["$", "5,000", "1.73", "m"])
- ["$", "5", "@,@", "000", "1", "@.@", "73", "m"]
+ ['$', '5', '@,@', '000', '1', '@.@', '73', 'm']
```"""
tokenized = []
for i in range(len(text_array)):
@@ -113,7 +113,7 @@ def detokenize_numbers(text: str) -> str:
```python
>>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
- "$ 5,000 1.73 m"
+ '$ 5,000 1.73 m'
```"""
for reg, sub in DETOKENIZE_NUMBERS:
text = re.sub(reg, sub, text)
...
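In both of these hunks the expected output is simply rewritten as the repr Python actually prints (single-quoted list elements and string), so the doctest matches. A round-trip sketch, assuming the two helpers are importable from the transfo_xl tokenization module listed in the final hunk:

```python
# tokenize_numbers/detokenize_numbers are module-level helpers; the import path is assumed
# from the file list added at the end of this commit.
from transformers.models.transfo_xl.tokenization_transfo_xl import detokenize_numbers, tokenize_numbers

tokens = tokenize_numbers(["$", "5,000", "1.73", "m"])
print(tokens)                                # ['$', '5', '@,@', '000', '1', '@.@', '73', 'm']
print(detokenize_numbers(" ".join(tokens)))  # $ 5,000 1.73 m
```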
@@ -467,3 +467,16 @@ src/transformers/models/mvp/tokenization_mvp.py
src/transformers/models/mvp/tokenization_mvp_fast.py
src/transformers/models/roberta/tokenization_roberta.py
src/transformers/models/roberta/tokenization_roberta_fast.py
+ src/transformers/models/auto/feature_extraction_auto.py
+ src/transformers/models/auto/image_processing_auto.py
+ src/transformers/models/auto/processing_auto.py
+ src/transformers/models/auto/tokenization_auto.py
+ src/transformers/models/bertweet/tokenization_bertweet.py
+ src/transformers/models/dpr/tokenization_dpr.py
+ src/transformers/models/dpr/tokenization_dpr_fast.py
+ src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
+ src/transformers/models/m2m_100/tokenization_m2m_100.py
+ src/transformers/models/marian/tokenization_marian.py
+ src/transformers/models/roformer/tokenization_roformer.py
+ src/transformers/models/roformer/tokenization_roformer_fast.py
+ src/transformers/models/transfo_xl/tokenization_transfo_xl.py
\ No newline at end of file
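The final hunk appends the touched files to the list of modules whose docstring examples are exercised in CI. A hedged sketch of checking one of them locally with pytest's built-in doctest collection (the project's actual CI invocation may wrap this differently):

```python
# --doctest-modules is a standard pytest option; this only approximates the repository's CI setup.
import pytest

pytest.main(["--doctest-modules", "src/transformers/models/roformer/tokenization_roformer.py", "-v"])
```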