Unverified Commit 6de4ee61 authored by Arthur's avatar Arthur Committed by GitHub
Browse files

Wav2 vec2 phoneme ctc tokenizer optimisation (#16817)



* Solved href rendering issue in heading

Markdown references in headings such as '####' don't render well.
Replaced them with <h4>...<a></a></h4> banners.

* PhonemeTokenizer optimization using phonemizer lib

The backend should only be initialized once; otherwise it is reloaded on every call.
Added an `init_backend` function, which initializes a `backend` attribute.
`phonemize` re-uses `self.backend`.
Should give ~10 times faster phonemization.

* formatted file with make style

* Documentation suggestion
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update /tokenization_wav2vec2_phoneme.py based on PR suggestion
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update CONTRIBUTING.md
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 306c9ee9
...@@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https:// ...@@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https://
Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
for more information. for more information.
#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md) **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
### Develop on Windows ### Develop on Windows
......
...@@ -158,6 +158,9 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): ...@@ -158,6 +158,9 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
self.phonemizer_lang = phonemizer_lang self.phonemizer_lang = phonemizer_lang
self.phonemizer_backend = phonemizer_backend self.phonemizer_backend = phonemizer_backend
if do_phonemize:
self.init_backend(self.phonemizer_lang)
with open(vocab_file, encoding="utf-8") as vocab_handle: with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle) self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()} self.decoder = {v: k for k, v in self.encoder.items()}
...@@ -169,6 +172,18 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): ...@@ -169,6 +172,18 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
def get_vocab(self) -> Dict: def get_vocab(self) -> Dict:
return dict(self.encoder, **self.added_tokens_encoder) return dict(self.encoder, **self.added_tokens_encoder)
def init_backend(self, phonemizer_lang: str):
    """
    Build the phonemizer backend for one language and cache it on `self.backend`.

    The backend class is looked up in phonemizer's `BACKENDS` mapping under `self.phonemizer_backend` and is
    instantiated with `language_switch="remove-flags"`. This method only assigns `self.backend`; it does not
    update `self.phonemizer_lang`.

    Args:
        phonemizer_lang (`str`): The language to be used.
    """
    requires_backends(self, "phonemizer")
    # Imported inside the method, after the availability check above.
    from phonemizer.backend import BACKENDS

    backend_cls = BACKENDS[self.phonemizer_backend]
    self.backend = backend_cls(phonemizer_lang, language_switch="remove-flags")
def prepare_for_tokenization( def prepare_for_tokenization(
self, self,
text: str, text: str,
...@@ -209,6 +224,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): ...@@ -209,6 +224,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
# set the correct phonemizer language # set the correct phonemizer language
if phonemizer_lang is not None: if phonemizer_lang is not None:
self.phonemizer_lang = phonemizer_lang self.phonemizer_lang = phonemizer_lang
self.init_backend(phonemizer_lang)
return (text, {}) return (text, {})
...@@ -234,23 +250,20 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): ...@@ -234,23 +250,20 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
return tokens return tokens
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str: def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
requires_backends(self, "phonemizer")
from phonemizer import phonemize
from phonemizer.separator import Separator from phonemizer.separator import Separator
word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else "" word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else ""
phonemizer_lang = phonemizer_lang if phonemizer_lang is not None else self.phonemizer_lang if phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang:
self.init_backend(phonemizer_lang)
else:
phonemizer_lang = self.phonemizer_lang
separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="") separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="")
phonemes = phonemize( phonemes = self.backend.phonemize(
text, [text],
language=phonemizer_lang,
backend=self.phonemizer_backend,
separator=separator, separator=separator,
language_switch="remove-flags",
) )
phonemes = phonemes.strip() phonemes = phonemes[0].strip()
return phonemes return phonemes
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment