Unverified commit c4a96cec authored by Patrick von Platen, committed by GitHub

Wav2Vec2 meets phonemes (#14353)



* up

* add tokenizer

* improve more

* finish tokenizer

* finish

* adapt speech recognition script

* adapt convert

* more fixes

* more fixes

* update phonemizer wav2vec2

* better naming

* fix more tests

* more fixes swedish

* correct tests

* finish

* improve script

* remove file

* up

* lets get those 100 model architectures until the end of the month

* make fix-copies

* correct more

* correct script

* more fixes

* more fixes

* add to docs

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* replace assert

* fix copies

* fix docs

* new try docs

* boom boom

* update

* add phonemizer to audio tests

* make fix-copies

* up

* upload models

* some changes

* Update tests/test_tokenization_wav2vec2_phoneme.py
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* more fixes

* remove @
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
parent 77d6c826
@@ -78,7 +78,7 @@ jobs:
      keys:
        - v0.4-torch_and_tf-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -116,7 +116,7 @@ jobs:
      keys:
        - v0.4-torch_and_tf-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -149,7 +149,7 @@ jobs:
      keys:
        - v0.4-torch_and_flax-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -186,7 +186,7 @@ jobs:
      keys:
        - v0.4-torch_and_flax-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -217,7 +217,7 @@ jobs:
      keys:
        - v0.4-torch-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -253,7 +253,7 @@ jobs:
      keys:
        - v0.4-torch-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -284,7 +284,7 @@ jobs:
      keys:
        - v0.4-tf-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
      - run: pip install tensorflow_probability
@@ -320,7 +320,7 @@ jobs:
      keys:
        - v0.4-tf-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
      - run: pip install tensorflow_probability
@@ -351,7 +351,7 @@ jobs:
      keys:
        - v0.4-flax-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[flax,testing,sentencepiece,flax-speech,vision]
      - run: pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -386,7 +386,7 @@ jobs:
      keys:
        - v0.4-flax-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[flax,testing,sentencepiece,vision,flax-speech]
      - run: pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -417,7 +417,7 @@ jobs:
      keys:
        - v0.4-torch-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -454,7 +454,7 @@ jobs:
      keys:
        - v0.4-torch-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
      - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -579,7 +579,7 @@ jobs:
      keys:
        - v0.4-torch_examples-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
      - run: pip install -r examples/pytorch/_tests_requirements.txt
@@ -614,7 +614,7 @@ jobs:
      keys:
        - v0.4-torch_examples-{{ checksum "setup.py" }}
        - v0.4-{{ checksum "setup.py" }}
-     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+     - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
      - run: pip install --upgrade pip
      - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
      - run: pip install -r examples/pytorch/_tests_requirements.txt
@@ -31,7 +31,7 @@ jobs:
      - name: Install dependencies
        run: |
          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
-         apt install -y libsndfile1-dev
+         apt install -y libsndfile1-dev espeak-ng
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -84,7 +84,7 @@ jobs:
    steps:
      - name: Install dependencies
        run: |
-         apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+         apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
          pip install --upgrade pip
          pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
@@ -141,7 +141,7 @@ jobs:
    # steps:
    # - name: Install dependencies
    # run: |
-   # apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+   # apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
    # pip install --upgrade pip
    # pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
    # pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -199,8 +199,8 @@ jobs:
    steps:
      - name: Install dependencies
        run: |
-         apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+         apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
-         apt install -y libsndfile1-dev
+         apt install -y libsndfile1-dev espeak-ng
          pip install --upgrade pip
          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -255,7 +255,7 @@ jobs:
    # steps:
    # - name: Install dependencies
    # run: |
-   # apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+   # apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
    # pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
    # pip install --upgrade pip
    # pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
@@ -312,7 +312,7 @@ jobs:
    # steps:
    # - name: Install dependencies
    # run: |
-   # apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+   # apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
    # pip install --upgrade pip
    # pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
    # pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -33,7 +33,7 @@ jobs:
      - name: Install dependencies
        run: |
-         apt -y update && apt install -y libsndfile1-dev git
+         apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -140,7 +140,7 @@
      - name: Install dependencies
        run: |
-         apt -y update && apt install -y libsndfile1-dev git
+         apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
          pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -237,7 +237,7 @@
      - name: Install dependencies
        run: |
-         apt -y update && apt install -y libsndfile1-dev git
+         apt -y update && apt install -y libsndfile1-dev git espeak-ng
          pip install --upgrade pip
          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
          pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -313,11 +313,13 @@ AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Ch
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/master/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlmprophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
 To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
@@ -290,11 +290,13 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/master/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlmprophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 1. 새로운 모델을 올리고 싶나요? 우리가 **상세한 가이드와 템플릿** 으로 새로운 모델을 올리도록 도와드릴게요. 가이드와 템플릿은 이 저장소의 [`templates`](./templates) 폴더에서 확인하실 수 있습니다. [컨트리뷰션 가이드라인](./CONTRIBUTING.md)을 꼭 확인해주시고, PR을 올리기 전에 메인테이너에게 연락하거나 이슈를 오픈해 피드백을 받으시길 바랍니다.
@@ -314,11 +314,13 @@ conda install -c huggingface transformers
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。
+1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/master/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。
 1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlmprophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。
 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。
+1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。
 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。
 1. 想要贡献新的模型?我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到他们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。
@@ -326,11 +326,13 @@ conda install -c huggingface transformers
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/master/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlmprophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 1. 想要貢獻新的模型?我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。
@@ -284,6 +284,8 @@
     title: VisualBERT
   - local: model_doc/wav2vec2
     title: Wav2Vec2
+  - local: model_doc/wav2vec2_phoneme
+    title: Wav2Vec2Phoneme
   - local: model_doc/wavlm
     title: WavLM
   - local: model_doc/xlm
@@ -296,6 +298,8 @@
     title: XLNet
   - local: model_doc/xlsr_wav2vec2
     title: XLSR-Wav2Vec2
+  - local: model_doc/xls_r
+    title: XLS-R
   title: Models
 - sections:
   - local: internal/modeling_utils
@@ -172,11 +172,13 @@ conversion utilities for the following models.
 1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
 1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/master/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](model_doc/xlmprophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](model_doc/xlmroberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
 ### Supported frameworks
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Wav2Vec2Phoneme
## Overview
The Wav2Vec2Phoneme model was proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., 2021)](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
The abstract from the paper is the following:
*Recent progress in self-training, self-supervised pretraining and unsupervised learning enabled well performing speech
recognition systems without any labeled data. However, in many cases there is labeled data available for related
languages which is not utilized by these methods. This paper extends previous work on zero-shot cross-lingual transfer
learning by fine-tuning a multilingually pretrained wav2vec 2.0 model to transcribe unseen languages. This is done by
mapping phonemes of the training languages to the target language using articulatory features. Experiments show that
this simple method significantly outperforms prior work which introduced task-specific architectures and used only part
of a monolingually pretrained model.*
Tips:

- Wav2Vec2Phoneme uses exactly the same architecture as Wav2Vec2.
- Wav2Vec2Phoneme is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
- Wav2Vec2Phoneme was trained using connectionist temporal classification (CTC), so the model output has to be decoded using [`Wav2Vec2PhonemeCTCTokenizer`] (see the short inference sketch below).
- Wav2Vec2Phoneme can be fine-tuned on multiple languages at once and decode unseen languages to a sequence of phonemes in a single forward pass.
- By default the model outputs a sequence of phonemes. To transform the phonemes into a sequence of words, one should make use of a dictionary and language model.

Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition.
This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec).
Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, so one can refer to [`Wav2Vec2`]'s documentation page for everything except the tokenizer.
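Below is a minimal sketch of transcribing speech to phonemes with a fine-tuned checkpoint. The checkpoint name and the dummy dataset are illustrative assumptions; any CTC checkpoint whose tokenizer is a [`Wav2Vec2PhonemeCTCTokenizer`] should work the same way. Note that the `phonemizer` library and the `espeak-ng` backend need to be installed.

```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor

# assumed checkpoint name, for illustration only
checkpoint = "facebook/wav2vec2-lv-60-espeak-cv-ft"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForCTC.from_pretrained(checkpoint)

# load a dummy 16kHz audio sample (assumed test dataset)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio = ds[0]["audio"]

inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")

with torch.no_grad():
    logits = model(inputs.input_values).logits

# CTC decoding: take the most likely token per frame, then collapse repetitions and blanks
predicted_ids = torch.argmax(logits, dim=-1)
phonemes = processor.batch_decode(predicted_ids)
print(phonemes)
```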
## Wav2Vec2PhonemeCTCTokenizer
[[autodoc]] Wav2Vec2PhonemeCTCTokenizer
- __call__
- batch_decode
- decode
- phonemize
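The sketch below exercises the tokenizer methods listed above. The checkpoint name and the `phonemizer_lang` keyword are assumptions used for illustration; they show the intended usage rather than a fixed API.

```python
from transformers import Wav2Vec2PhonemeCTCTokenizer

# assumed checkpoint name, for illustration only
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

# phonemize turns raw text into a phoneme string (requires phonemizer + espeak-ng)
print(tokenizer.phonemize("Hello world", phonemizer_lang="en-us"))

# __call__ phonemizes and encodes the text into label ids, decode maps ids back to phonemes
label_ids = tokenizer("Hello world").input_ids
print(tokenizer.decode(label_ids))
```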
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
XLS-R
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The XLS-R model was proposed in `XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale
<https://arxiv.org/abs/2111.09296>`__ by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman
Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
The abstract from the paper is the following:
*This paper presents XLS-R, a large-scale model for cross-lingual speech representation learning based on wav2vec 2.0.
We train models with up to 2B parameters on nearly half a million hours of publicly available speech audio in 128
languages, an order of magnitude more public data than the largest known prior work. Our evaluation covers a wide range
of tasks, domains, data regimes and languages, both high and low-resource. On the CoVoST-2 speech translation
benchmark, we improve the previous state of the art by an average of 7.4 BLEU over 21 translation directions into
English. For speech recognition, XLS-R improves over the best known prior work on BABEL, MLS, CommonVoice as well as
VoxPopuli, lowering error rates by 14-34% relative on average. XLS-R also sets a new state of the art on VoxLingua107
language identification. Moreover, we show that with sufficient model size, cross-lingual pretraining can outperform
English-only pretraining when translating English speech into other languages, a setting which favors monolingual
pretraining. We hope XLS-R can help to improve speech processing tasks for many more languages of the world.*
Tips:
- XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
- The XLS-R model was trained using connectionist temporal classification (CTC), so the model output has to be decoded
using :class:`~transformers.Wav2Vec2CTCTokenizer`.
Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r.
XLS-R's architecture is based on the Wav2Vec2 model, so one can refer to :doc:`Wav2Vec2's documentation page
<wav2vec2>`.
The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec>`__.
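Since pretrained XLS-R checkpoints share the Wav2Vec2 architecture, cross-lingual speech representations can be
extracted with the regular Wav2Vec2 classes. The sketch below assumes the checkpoint name
``facebook/wav2vec2-xls-r-300m`` and a dummy test dataset; both are illustrative assumptions only.

.. code-block:: python

    import torch
    from datasets import load_dataset
    from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

    # assumed checkpoint name, see https://huggingface.co/models?other=xls_r for available checkpoints
    checkpoint = "facebook/wav2vec2-xls-r-300m"
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)
    model = Wav2Vec2Model.from_pretrained(checkpoint)

    # load a dummy 16kHz audio sample (assumed test dataset)
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    audio = ds[0]["audio"]

    inputs = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    with torch.no_grad():
        outputs = model(inputs.input_values)

    # (batch, frames, hidden_size) speech representations
    print(outputs.last_hidden_state.shape)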
...@@ -21,6 +21,7 @@ import logging ...@@ -21,6 +21,7 @@ import logging
import os import os
import re import re
import sys import sys
import warnings
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union from typing import Dict, List, Optional, Union
...@@ -34,6 +35,7 @@ from transformers import ( ...@@ -34,6 +35,7 @@ from transformers import (
AutoConfig, AutoConfig,
AutoFeatureExtractor, AutoFeatureExtractor,
AutoModelForCTC, AutoModelForCTC,
AutoProcessor,
AutoTokenizer, AutoTokenizer,
HfArgumentParser, HfArgumentParser,
Trainer, Trainer,
...@@ -68,6 +70,10 @@ class ModelArguments: ...@@ -68,6 +70,10 @@ class ModelArguments:
model_name_or_path: str = field( model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
) )
tokenizer_name_or_path: Optional[str] = field(
default=None,
metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
)
cache_dir: Optional[str] = field( cache_dir: Optional[str] = field(
default=None, default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
...@@ -191,7 +197,7 @@ class DataTrainingArguments: ...@@ -191,7 +197,7 @@ class DataTrainingArguments:
max_duration_in_seconds: Optional[float] = field( max_duration_in_seconds: Optional[float] = field(
default=20.0, default=20.0,
metadata={ metadata={
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`" "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
}, },
) )
min_duration_in_seconds: Optional[float] = field( min_duration_in_seconds: Optional[float] = field(
...@@ -210,7 +216,28 @@ class DataTrainingArguments: ...@@ -210,7 +216,28 @@ class DataTrainingArguments:
default=False, default=False,
metadata={ metadata={
"help": "If :obj:`True`, will use the token generated when running" "help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli logiin as HTTP bearer authorization for remote files." ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
},
)
unk_token: Optional[str] = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
)
pad_token: Optional[str] = field(
default="[PAD]",
metadata={"help": "The padding token for the tokenizer"},
)
word_delimiter_token: Optional[str] = field(
default="|",
metadata={"help": "The word delimiter token for the tokenizer"},
)
phoneme_language: Optional[str] = field(
default=None,
metadata={
"help": "The target language that should be used be"
" passed to the tokenizer for tokenization. Note that"
" this is only relevant if the model classifies the"
" input audio to a sequence of phoneme sequences."
}, },
) )
...@@ -220,7 +247,7 @@ class DataCollatorCTCWithPadding: ...@@ -220,7 +247,7 @@ class DataCollatorCTCWithPadding:
""" """
Data collator that will dynamically pad the inputs received. Data collator that will dynamically pad the inputs received.
Args: Args:
processor (:class:`~transformers.Wav2Vec2Processor`) processor (:class:`~transformers.AutoProcessor`)
The processor used for processing the data. The processor used for processing the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index) Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
...@@ -241,7 +268,7 @@ class DataCollatorCTCWithPadding: ...@@ -241,7 +268,7 @@ class DataCollatorCTCWithPadding:
7.5 (Volta). 7.5 (Volta).
""" """
processor: Wav2Vec2Processor processor: AutoProcessor
padding: Union[bool, str] = "longest" padding: Union[bool, str] = "longest"
pad_to_multiple_of: Optional[int] = None pad_to_multiple_of: Optional[int] = None
pad_to_multiple_of_labels: Optional[int] = None pad_to_multiple_of_labels: Optional[int] = None
...@@ -275,7 +302,12 @@ class DataCollatorCTCWithPadding: ...@@ -275,7 +302,12 @@ class DataCollatorCTCWithPadding:
return batch return batch
def create_vocabulary_from_data(datasets: DatasetDict): def create_vocabulary_from_data(
datasets: DatasetDict,
word_delimiter_token: Optional[str] = None,
unk_token: Optional[str] = None,
pad_token: Optional[str] = None,
):
# Given training and test labels create vocabulary # Given training and test labels create vocabulary
def extract_all_chars(batch): def extract_all_chars(batch):
all_text = " ".join(batch["target_text"]) all_text = " ".join(batch["target_text"])
...@@ -298,12 +330,16 @@ def create_vocabulary_from_data(datasets: DatasetDict): ...@@ -298,12 +330,16 @@ def create_vocabulary_from_data(datasets: DatasetDict):
vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))} vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
# replace white space with delimiter token # replace white space with delimiter token
vocab_dict["|"] = vocab_dict[" "] if word_delimiter_token is not None:
vocab_dict[word_delimiter_token] = vocab_dict[" "]
del vocab_dict[" "] del vocab_dict[" "]
# add unk and pad token # add unk and pad token
vocab_dict["[UNK]"] = len(vocab_dict) if unk_token is not None:
vocab_dict["[PAD]"] = len(vocab_dict) vocab_dict[unk_token] = len(vocab_dict)
if pad_token is not None:
vocab_dict[pad_token] = len(vocab_dict)
return vocab_dict return vocab_dict
...@@ -359,12 +395,11 @@ def main(): ...@@ -359,12 +395,11 @@ def main():
# 1. First, let's load the dataset # 1. First, let's load the dataset
raw_datasets = DatasetDict() raw_datasets = DatasetDict()
if training_args.do_train:
raw_datasets["train"] = load_dataset( raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
) )
raw_datasets["eval"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
)
if data_args.audio_column_name not in raw_datasets["train"].column_names: if data_args.audio_column_name not in raw_datasets["train"].column_names:
raise ValueError( raise ValueError(
...@@ -380,10 +415,14 @@ def main(): ...@@ -380,10 +415,14 @@ def main():
f"{', '.join(raw_datasets['train'].column_names)}." f"{', '.join(raw_datasets['train'].column_names)}."
) )
# prepare dataset
if data_args.max_train_samples is not None: if data_args.max_train_samples is not None:
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples)) raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
if training_args.do_eval:
raw_datasets["eval"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
)
if data_args.max_eval_samples is not None: if data_args.max_eval_samples is not None:
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples)) raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
...@@ -391,30 +430,49 @@ def main(): ...@@ -391,30 +430,49 @@ def main():
# that make training complicated and do not help in transcribing the speech # that make training complicated and do not help in transcribing the speech
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
# that could be easily picked up by the model # that could be easily picked up by the model
chars_to_ignore_regex = ( chars_to_ignore_regex = (
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
) )
text_column_name = data_args.text_column_name
def remove_special_characters(batch): def remove_special_characters(batch):
if chars_to_ignore_regex is not None: if chars_to_ignore_regex is not None:
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[data_args.text_column_name]).lower() + " " batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
else: else:
batch["target_text"] = batch[data_args.text_column_name].lower() + " " batch["target_text"] = batch[text_column_name].lower() + " "
return batch return batch
with training_args.main_process_first(desc="dataset map special characters removal"): with training_args.main_process_first(desc="dataset map special characters removal"):
raw_datasets = raw_datasets.map( raw_datasets = raw_datasets.map(
remove_special_characters, remove_special_characters,
remove_columns=[data_args.text_column_name], remove_columns=[text_column_name],
desc="remove special characters from datasets", desc="remove special characters from datasets",
) )
# 3. Next, we create the vocabulary of the model by extracting all unique characters from # save special tokens for tokenizer
word_delimiter_token = data_args.word_delimiter_token
unk_token = data_args.unk_token
pad_token = data_args.pad_token
# 3. Next, let's load the config as we might need it to create
# the tokenizer
# load config
config = AutoConfig.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
)
# 4. Next, if no tokenizer file is defined,
# we create the vocabulary of the model by extracting all unique characters from
# the training and evaluation datasets # the training and evaluation datasets
# We need to make sure that only first rank saves vocabulary # We need to make sure that only first rank saves vocabulary
# make sure all processes wait until vocab is created # make sure all processes wait until vocab is created
vocab_file = os.path.join(training_args.output_dir, "vocab.json") tokenizer_name_or_path = model_args.tokenizer_name_or_path
tokenizer_type_hints = {}
if tokenizer_name_or_path is None:
# save vocab in training output dir
tokenizer_name_or_path = training_args.output_dir
vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
with training_args.main_process_first(): with training_args.main_process_first():
if training_args.overwrite_output_dir and os.path.isfile(vocab_file): if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
...@@ -422,40 +480,41 @@ def main(): ...@@ -422,40 +480,41 @@ def main():
with training_args.main_process_first(desc="dataset map vocabulary creation"): with training_args.main_process_first(desc="dataset map vocabulary creation"):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
os.makedirs(training_args.output_dir, exist_ok=True) os.makedirs(tokenizer_name_or_path, exist_ok=True)
vocab_dict = create_vocabulary_from_data(raw_datasets) vocab_dict = create_vocabulary_from_data(
raw_datasets,
word_delimiter_token=word_delimiter_token,
unk_token=unk_token,
pad_token=pad_token,
)
# save vocab dict to be loaded into tokenizer # save vocab dict to be loaded into tokenizer
with open(vocab_file, "w") as file: with open(vocab_file, "w") as file:
json.dump(vocab_dict, file) json.dump(vocab_dict, file)
# 4. Now we can instantiate the configuration, feature extractor, tokenizer and model # if tokenizer has just been created
# it is defined by `tokenizer_class` if present in config else by `model_type`
tokenizer_type_hints = {
"config": config if config.tokenizer_class is not None else None,
"tokenizer_type": config.model_type if config.tokenizer_class is None else None,
}
# 5. Now we can instantiate the feature extractor, tokenizer and model
# Note for distributed training, the .from_pretrained methods guarantee that only # Note for distributed training, the .from_pretrained methods guarantee that only
# one local process can concurrently download model & vocab. # one local process can concurrently download model & vocab.
# load config # load feature_extractor and tokenizer
config = AutoConfig.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
)
# tokenizer is defined by `tokenizer_class` if present in config else by `model_type`
config_for_tokenizer = config if config.tokenizer_class is not None else None
tokenizer_type = config.model_type if config.tokenizer_class is None else None
# load feature_extractor, tokenizer and create processor
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
training_args.output_dir, tokenizer_name_or_path,
config=config_for_tokenizer, unk_token=unk_token,
tokenizer_type=tokenizer_type, pad_token=pad_token,
unk_token="[UNK]", word_delimiter_token=word_delimiter_token,
pad_token="[PAD]",
word_delimiter_token="|",
use_auth_token=data_args.use_auth_token, use_auth_token=data_args.use_auth_token,
**tokenizer_type_hints,
) )
feature_extractor = AutoFeatureExtractor.from_pretrained( feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
) )
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# adapt config # adapt config
config.update( config.update(
...@@ -471,8 +530,8 @@ def main(): ...@@ -471,8 +530,8 @@ def main():
"gradient_checkpointing": training_args.gradient_checkpointing, "gradient_checkpointing": training_args.gradient_checkpointing,
"layerdrop": model_args.layerdrop, "layerdrop": model_args.layerdrop,
"ctc_loss_reduction": model_args.ctc_loss_reduction, "ctc_loss_reduction": model_args.ctc_loss_reduction,
"pad_token_id": processor.tokenizer.pad_token_id, "pad_token_id": tokenizer.pad_token_id,
"vocab_size": len(processor.tokenizer), "vocab_size": len(tokenizer),
"activation_dropout": model_args.activation_dropout, "activation_dropout": model_args.activation_dropout,
} }
) )
...@@ -489,55 +548,64 @@ def main(): ...@@ -489,55 +548,64 @@ def main():
if model_args.freeze_feature_extractor: if model_args.freeze_feature_extractor:
model.freeze_feature_extractor() model.freeze_feature_extractor()
# 5. Now we preprocess the datasets including loading the audio, resampling and normalization # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio, # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
# so that we just need to set the correct target sampling rate and normalize the input # so that we just need to set the correct target sampling rate and normalize the input
# via the `feature_extractor` # via the `feature_extractor`
# make sure that dataset decodes audio with correct sampling rate # make sure that dataset decodes audio with correct sampling rate
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
if dataset_sampling_rate != feature_extractor.sampling_rate:
raw_datasets = raw_datasets.cast_column( raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate) data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
) )
# derive max & min input length for sample rate & max duration # derive max & min input length for sample rate & max duration
max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
audio_column_name = data_args.audio_column_name
num_workers = data_args.preprocessing_num_workers
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
phoneme_language = data_args.phoneme_language
# Preprocessing the datasets. # Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets. # We need to read the audio files as arrays and tokenize the targets.
def prepare_dataset(batch): def prepare_dataset(batch):
# load audio # load audio
sample = batch[data_args.audio_column_name] sample = batch[audio_column_name]
batch["input_values"] = processor( inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
sample["array"], sampling_rate=sample["sampling_rate"], truncate=True, max_length=max_input_length batch["input_values"] = inputs.input_values[0]
).input_values[0]
batch["input_length"] = len(batch["input_values"]) batch["input_length"] = len(batch["input_values"])
# Setup the processor for targets # encode targets
with processor.as_target_processor(): additional_kwargs = {}
batch["labels"] = processor(batch["target_text"]).input_ids if phoneme_language is not None:
additional_kwargs["phonemizer_lang"] = phoneme_language
batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
return batch return batch
with training_args.main_process_first(desc="dataset map preprocessing"): with training_args.main_process_first(desc="dataset map preprocessing"):
vectorized_datasets = raw_datasets.map( vectorized_datasets = raw_datasets.map(
prepare_dataset, prepare_dataset,
remove_columns=raw_datasets["train"].column_names, remove_columns=next(iter(raw_datasets.values())).column_names,
num_proc=data_args.preprocessing_num_workers, num_proc=num_workers,
desc="preprocess datasets", desc="preprocess datasets",
) )
if min_input_length > 0.0: def is_audio_in_length_range(length):
return length > min_input_length and length < max_input_length
# filter data that is shorter than min_input_length # filter data that is shorter than min_input_length
vectorized_datasets = vectorized_datasets.filter( vectorized_datasets = vectorized_datasets.filter(
lambda x: x > min_input_length, is_audio_in_length_range,
num_proc=data_args.preprocessing_num_workers, num_proc=num_workers,
input_columns=["input_length"], input_columns=["input_length"],
) )
vectorized_datasets = vectorized_datasets.remove_columns("input_length") # 7. Next, we can prepare the training.
# 6. Next, we can prepare the training.
# Let's use word error rate (WER) as our evaluation metric, # Let's use word error rate (WER) as our evaluation metric,
# instantiate a data collator and the trainer # instantiate a data collator and the trainer
...@@ -557,16 +625,36 @@ def main(): ...@@ -557,16 +625,36 @@ def main():
pred_logits = pred.predictions pred_logits = pred.predictions
pred_ids = np.argmax(pred_logits, axis=-1) pred_ids = np.argmax(pred_logits, axis=-1)
pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
pred_str = processor.batch_decode(pred_ids) pred_str = tokenizer.batch_decode(pred_ids)
# we do not want to group tokens when computing the metrics # we do not want to group tokens when computing the metrics
label_str = processor.batch_decode(pred.label_ids, group_tokens=False) label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()} metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
return metrics return metrics
# Now create a single processor
if is_main_process(training_args.local_rank):
# save feature extractor, tokenizer and config
feature_extractor.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
config.save_pretrained(training_args.output_dir)
# load processor
try:
processor = AutoProcessor.from_pretrained(training_args.output_dir)
except (OSError, KeyError):
warnings.warn(
"Loading a processor from a feature extractor config that does not"
" include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
" attribute to your `preprocessor_config.json` file to suppress this warning: "
" `'processor_class': 'Wav2Vec2Processor'`",
FutureWarning,
)
processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
# Instantiate custom data collator # Instantiate custom data collator
data_collator = DataCollatorCTCWithPadding(processor=processor) data_collator = DataCollatorCTCWithPadding(processor=processor)
...@@ -578,10 +666,10 @@ def main(): ...@@ -578,10 +666,10 @@ def main():
compute_metrics=compute_metrics, compute_metrics=compute_metrics,
train_dataset=vectorized_datasets["train"] if training_args.do_train else None, train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
tokenizer=processor.feature_extractor, tokenizer=feature_extractor,
) )
# 7. Finally, we can start training # 8. Finally, we can start training
# Training # Training
if training_args.do_train: if training_args.do_train:
...@@ -594,10 +682,6 @@ def main(): ...@@ -594,10 +682,6 @@ def main():
else: else:
checkpoint = None checkpoint = None
# Save the feature_extractor and the tokenizer
if is_main_process(training_args.local_rank):
processor.save_pretrained(training_args.output_dir)
train_result = trainer.train(resume_from_checkpoint=checkpoint) train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() trainer.save_model()
......
...@@ -123,6 +123,7 @@ _deps = [ ...@@ -123,6 +123,7 @@ _deps = [
"optax>=0.0.8", "optax>=0.0.8",
"packaging>=20.0", "packaging>=20.0",
"parameterized", "parameterized",
"phonemizer",
"protobuf", "protobuf",
"psutil", "psutil",
"pyyaml>=5.1", "pyyaml>=5.1",
...@@ -254,7 +255,7 @@ extras["sigopt"] = deps_list("sigopt") ...@@ -254,7 +255,7 @@ extras["sigopt"] = deps_list("sigopt")
extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"] extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
extras["audio"] = deps_list("librosa", "pyctcdecode") extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
extras["speech"] = deps_list("torchaudio") + extras["audio"] # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead extras["speech"] = deps_list("torchaudio") + extras["audio"] # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
extras["tf-speech"] = extras["audio"] extras["tf-speech"] = extras["audio"]
......
...@@ -116,6 +116,7 @@ _import_structure = { ...@@ -116,6 +116,7 @@ _import_structure = {
"is_datasets_available", "is_datasets_available",
"is_faiss_available", "is_faiss_available",
"is_flax_available", "is_flax_available",
"is_phonemizer_available",
"is_psutil_available", "is_psutil_available",
"is_py3nvml_available", "is_py3nvml_available",
"is_pyctcdecode_available", "is_pyctcdecode_available",
...@@ -313,6 +314,7 @@ _import_structure = { ...@@ -313,6 +314,7 @@ _import_structure = {
"Wav2Vec2Processor", "Wav2Vec2Processor",
"Wav2Vec2Tokenizer", "Wav2Vec2Tokenizer",
], ],
"models.wav2vec2_phoneme": ["Wav2Vec2PhonemeCTCTokenizer"],
"models.wav2vec2_with_lm": ["Wav2Vec2ProcessorWithLM"], "models.wav2vec2_with_lm": ["Wav2Vec2ProcessorWithLM"],
"models.wavlm": [ "models.wavlm": [
"WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP",
...@@ -2158,6 +2160,7 @@ if TYPE_CHECKING: ...@@ -2158,6 +2160,7 @@ if TYPE_CHECKING:
is_datasets_available, is_datasets_available,
is_faiss_available, is_faiss_available,
is_flax_available, is_flax_available,
is_phonemizer_available,
is_psutil_available, is_psutil_available,
is_py3nvml_available, is_py3nvml_available,
is_pyctcdecode_available, is_pyctcdecode_available,
...@@ -2339,6 +2342,7 @@ if TYPE_CHECKING: ...@@ -2339,6 +2342,7 @@ if TYPE_CHECKING:
Wav2Vec2Processor, Wav2Vec2Processor,
Wav2Vec2Tokenizer, Wav2Vec2Tokenizer,
) )
from .models.wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer
from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig
from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer
......
...@@ -34,6 +34,7 @@ deps = { ...@@ -34,6 +34,7 @@ deps = {
"optax": "optax>=0.0.8", "optax": "optax>=0.0.8",
"packaging": "packaging>=20.0", "packaging": "packaging>=20.0",
"parameterized": "parameterized", "parameterized": "parameterized",
"phonemizer": "phonemizer",
"protobuf": "protobuf", "protobuf": "protobuf",
"psutil": "psutil", "psutil": "psutil",
"pyyaml": "pyyaml>=5.1", "pyyaml": "pyyaml>=5.1",
......
...@@ -237,6 +237,14 @@ except importlib_metadata.PackageNotFoundError: ...@@ -237,6 +237,14 @@ except importlib_metadata.PackageNotFoundError:
_torchaudio_available = False _torchaudio_available = False
_phonemizer_available = importlib.util.find_spec("phonemizer") is not None
try:
_phonemizer_version = importlib_metadata.version("phonemizer")
logger.debug(f"Successfully imported phonemizer version {_phonemizer_version}")
except importlib_metadata.PackageNotFoundError:
_phonemizer_available = False
_pyctcdecode_available = importlib.util.find_spec("pyctcdecode") is not None _pyctcdecode_available = importlib.util.find_spec("pyctcdecode") is not None
try: try:
_pyctcdecode_version = importlib_metadata.version("pyctcdecode") _pyctcdecode_version = importlib_metadata.version("pyctcdecode")
...@@ -592,6 +600,10 @@ def is_speech_available(): ...@@ -592,6 +600,10 @@ def is_speech_available():
return _torchaudio_available return _torchaudio_available
def is_phonemizer_available():
return _phonemizer_available
def torch_only_method(fn): def torch_only_method(fn):
def wrapper(*args, **kwargs): def wrapper(*args, **kwargs):
if not _torch_available: if not _torch_available:
...@@ -728,6 +740,13 @@ explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/ins ...@@ -728,6 +740,13 @@ explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/ins
""" """
# docstyle-ignore
PHONEMIZER_IMPORT_ERROR = """
{0} requires the phonemizer library but it was not found in your environment. You can install it with pip:
`pip install phonemizer`
"""
# docstyle-ignore # docstyle-ignore
SCIPY_IMPORT_ERROR = """ SCIPY_IMPORT_ERROR = """
{0} requires the scipy library but it was not found in your environment. You can install it with pip: {0} requires the scipy library but it was not found in your environment. You can install it with pip:
...@@ -774,6 +793,7 @@ BACKENDS_MAPPING = OrderedDict( ...@@ -774,6 +793,7 @@ BACKENDS_MAPPING = OrderedDict(
("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)), ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)), ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)), ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)), ("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)),
("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)), ("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)),
......
...@@ -107,6 +107,7 @@ from . import ( ...@@ -107,6 +107,7 @@ from . import (
visual_bert, visual_bert,
vit, vit,
wav2vec2, wav2vec2,
wav2vec2_phoneme,
wav2vec2_with_lm, wav2vec2_with_lm,
wavlm, wavlm,
xlm, xlm,
......
...@@ -223,6 +223,7 @@ else: ...@@ -223,6 +223,7 @@ else:
"CLIPTokenizerFast" if is_tokenizers_available() else None, "CLIPTokenizerFast" if is_tokenizers_available() else None,
), ),
), ),
("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
( (
"perceiver", "perceiver",
( (
......
...@@ -16,11 +16,22 @@ ...@@ -16,11 +16,22 @@
import argparse import argparse
import json
import os
import fairseq import fairseq
import torch import torch
from fairseq.data import Dictionary
from transformers import UniSpeechConfig, UniSpeechForPreTraining, logging from transformers import (
UniSpeechConfig,
UniSpeechForCTC,
UniSpeechForPreTraining,
Wav2Vec2FeatureExtractor,
Wav2Vec2PhonemeCTCTokenizer,
Wav2Vec2Processor,
logging,
)
logging.set_verbosity_info() logging.set_verbosity_info()
...@@ -55,8 +66,17 @@ TOP_LEVEL_KEYS = [ ...@@ -55,8 +66,17 @@ TOP_LEVEL_KEYS = [
] ]
def set_recursively(hf_pointer, key, value, full_name, weight_type): def set_recursively(hf_pointer, key, value, full_name, weight_type, is_finetuned):
for attribute in key.split("."): for attribute in key.split("."):
if is_finetuned:
if attribute in ["quantizer", "project_q", "project_hid"]:
# those layers are only relevant for pretraining and should be dropped
return
if attribute == "ctc_proj":
# we should rename `ctc_proj` to `lm_head` for fine-tuned phoneme models
attribute = "lm_head"
hf_pointer = getattr(hf_pointer, attribute) hf_pointer = getattr(hf_pointer, attribute)
if weight_type is not None: if weight_type is not None:
...@@ -82,7 +102,7 @@ def set_recursively(hf_pointer, key, value, full_name, weight_type): ...@@ -82,7 +102,7 @@ def set_recursively(hf_pointer, key, value, full_name, weight_type):
logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
def recursively_load_weights(fairseq_model, hf_model): def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
unused_weights = [] unused_weights = []
fairseq_dict = fairseq_model.state_dict() fairseq_dict = fairseq_model.state_dict()
...@@ -118,7 +138,7 @@ def recursively_load_weights(fairseq_model, hf_model): ...@@ -118,7 +138,7 @@ def recursively_load_weights(fairseq_model, hf_model):
weight_type = "weight" weight_type = "weight"
else: else:
weight_type = None weight_type = None
set_recursively(hf_model, mapped_key, value, name, weight_type) set_recursively(hf_model, mapped_key, value, name, weight_type, is_finetuned)
continue continue
if not is_used: if not is_used:
unused_weights.append(name) unused_weights.append(name)
...@@ -163,7 +183,9 @@ def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_gro ...@@ -163,7 +183,9 @@ def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_gro
@torch.no_grad() @torch.no_grad()
def convert_unispeech_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): def convert_unispeech_checkpoint(
checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
""" """
Copy/paste/tweak model's weights to transformers design. Copy/paste/tweak model's weights to transformers design.
""" """
...@@ -172,12 +194,62 @@ def convert_unispeech_checkpoint(checkpoint_path, pytorch_dump_folder_path, conf ...@@ -172,12 +194,62 @@ def convert_unispeech_checkpoint(checkpoint_path, pytorch_dump_folder_path, conf
else: else:
config = UniSpeechConfig() config = UniSpeechConfig()
if is_finetuned:
if dict_path:
target_dict = Dictionary.load_from_json(dict_path)
# important change bos & pad token id since CTC symbol is <pad> and
# not <s> as in fairseq
config.bos_token_id = target_dict.pad_index
config.pad_token_id = target_dict.bos_index
config.eos_token_id = target_dict.eos_index
config.vocab_size = len(target_dict.symbols)
vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
if not os.path.isdir(pytorch_dump_folder_path):
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
vocab_dict = target_dict.indices
# fairseq has the <pad> and <s> switched
vocab_dict["<pad>"] = 42
vocab_dict["<s>"] = 43
with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(vocab_dict, vocab_handle)
tokenizer = Wav2Vec2PhonemeCTCTokenizer(
vocab_path,
unk_token=target_dict.unk_word,
pad_token=target_dict.pad_word,
bos_token=target_dict.bos_word,
eos_token=target_dict.eos_word,
word_delimiter_token="|",
do_lower_case=False,
)
return_attention_mask = True if config.feat_extract_norm == "layer" else False
feature_extractor = Wav2Vec2FeatureExtractor(
feature_size=1,
sampling_rate=16000,
padding_value=0,
do_normalize=True,
return_attention_mask=return_attention_mask,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(pytorch_dump_folder_path)
hf_unispeech = UniSpeechForCTC(config)
else:
hf_unispeech = UniSpeechForPreTraining(config) hf_unispeech = UniSpeechForPreTraining(config)
if is_finetuned:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1]), "w2v_path": checkpoint_path}
)
else:
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path]) model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])
model = model[0].eval() model = model[0].eval()
recursively_load_weights(model, hf_unispeech) recursively_load_weights(model, hf_unispeech, is_finetuned)
hf_unispeech.save_pretrained(pytorch_dump_folder_path) hf_unispeech.save_pretrained(pytorch_dump_folder_path)
...@@ -186,6 +258,12 @@ if __name__ == "__main__": ...@@ -186,6 +258,12 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
parser.add_argument(
"--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
)
args = parser.parse_args() args = parser.parse_args()
convert_unispeech_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) convert_unispeech_checkpoint(
args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
)
...@@ -205,8 +205,13 @@ def convert_wav2vec2_checkpoint( ...@@ -205,8 +205,13 @@ def convert_wav2vec2_checkpoint(
logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path)) logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
return return
os.makedirs(pytorch_dump_folder_path, exist_ok=True) os.makedirs(pytorch_dump_folder_path, exist_ok=True)
vocab_dict = target_dict.indices
# fairseq has the <pad> and <s> switched
vocab_dict["<pad>"] = 0
vocab_dict["<s>"] = 1
with open(vocab_path, "w", encoding="utf-8") as vocab_handle: with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
json.dump(target_dict.indices, vocab_handle) json.dump(vocab_dict, vocab_handle)
tokenizer = Wav2Vec2CTCTokenizer( tokenizer = Wav2Vec2CTCTokenizer(
vocab_path, vocab_path,
unk_token=target_dict.unk_word, unk_token=target_dict.unk_word,
......