"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "eaabaaf750613e00fdece7245ef04ca3d1a775a9"
Unverified Commit da3c79b2 authored by Sanchit Gandhi's avatar Sanchit Gandhi Committed by GitHub
Browse files

[Whisper] Make tokenizer normalization public (#28136)

* [Whisper] Make tokenizer normalization public

* add to docs
parent e694e985
...@@ -102,6 +102,8 @@ python convert_hf_to_openai.py \ ...@@ -102,6 +102,8 @@ python convert_hf_to_openai.py \
- save_vocabulary - save_vocabulary
- batch_decode - batch_decode
- decode - decode
- basic_normalize
- normalize
## WhisperTokenizerFast ## WhisperTokenizerFast
...@@ -113,6 +115,8 @@ python convert_hf_to_openai.py \ ...@@ -113,6 +115,8 @@ python convert_hf_to_openai.py \
- save_vocabulary - save_vocabulary
- batch_decode - batch_decode
- decode - decode
- basic_normalize
- normalize
## WhisperFeatureExtractor ## WhisperFeatureExtractor
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
"""Tokenization classes for Whisper.""" """Tokenization classes for Whisper."""
import json import json
import os import os
import warnings
from functools import lru_cache from functools import lru_cache
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
...@@ -507,6 +508,20 @@ class WhisperTokenizer(PreTrainedTokenizer): ...@@ -507,6 +508,20 @@ class WhisperTokenizer(PreTrainedTokenizer):
return self.decoder.get(index, "") return self.decoder.get(index, "")
def _normalize(self, text): def _normalize(self, text):
warnings.warn(
"The private method `_normalize` is deprecated and will be removed in v5 of Transformers."
"You can normalize an input string using the Whisper English normalizer using the `normalize` method."
)
return self.normalize(text)
def _basic_normalize(self, text, remove_diacritics=False):
warnings.warn(
"The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers."
"You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
)
return self.basic_normalize(text, remove_diacritics=remove_diacritics)
def normalize(self, text):
""" """
Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on
english text. english text.
...@@ -515,7 +530,7 @@ class WhisperTokenizer(PreTrainedTokenizer): ...@@ -515,7 +530,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
return normalizer(text) return normalizer(text)
@staticmethod @staticmethod
def _basic_normalize(text, remove_diacritics=False): def basic_normalize(text, remove_diacritics=False):
""" """
Normalize a given string using the `BasicTextNormalizer` class, which preforms commons transformation on Normalize a given string using the `BasicTextNormalizer` class, which preforms commons transformation on
multilingual text. multilingual text.
...@@ -745,10 +760,10 @@ class WhisperTokenizer(PreTrainedTokenizer): ...@@ -745,10 +760,10 @@ class WhisperTokenizer(PreTrainedTokenizer):
text = "".join(sub_texts) text = "".join(sub_texts)
if normalize: if normalize:
clean_text = self._normalize(text) clean_text = self.normalize(text)
return clean_text return clean_text
elif basic_normalize: elif basic_normalize:
clean_text = self._basic_normalize(text, remove_diacritics=remove_diacritics) clean_text = self.basic_normalize(text, remove_diacritics=remove_diacritics)
return clean_text return clean_text
else: else:
return text return text
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
import json import json
import os import os
import re import re
import warnings
from functools import lru_cache from functools import lru_cache
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
...@@ -427,6 +428,22 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast): ...@@ -427,6 +428,22 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._normalize # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._normalize
def _normalize(self, text): def _normalize(self, text):
warnings.warn(
"The private method `_normalize` is deprecated and will be removed in v5 of Transformers."
"You can normalize an input string using the Whisper English normalizer using the `normalize` method."
)
return self.normalize(text)
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._basic_normalize
def _basic_normalize(self, text, remove_diacritics=False):
warnings.warn(
"The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers."
"You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
)
return self.basic_normalize(text, remove_diacritics=remove_diacritics)
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize
def normalize(self, text):
""" """
Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on
english text. english text.
...@@ -435,8 +452,8 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast): ...@@ -435,8 +452,8 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
return normalizer(text) return normalizer(text)
@staticmethod @staticmethod
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._basic_normalize # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize
def _basic_normalize(text, remove_diacritics=False): def basic_normalize(text, remove_diacritics=False):
""" """
Normalize a given string using the `BasicTextNormalizer` class, which preforms commons transformation on Normalize a given string using the `BasicTextNormalizer` class, which preforms commons transformation on
multilingual text. multilingual text.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment