Unverified Commit a0e69a93 authored by Zineng Tang, committed by GitHub

Add TVLT (#20725)



* Update image_processing_tvlt.py

* Update modeling_tvlt.py

* Update

* Update modeling_tvlt.py

* Create tvlt.mdx

* Update configuration_tvlt.py

* Update modeling_tvlt.py

* Update test_modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update image_processing_tvlt.py

* Update feature_extraction_tvlt.py

* Update tvlt models

* Update tests

* Update

* Update

* Update tests

* Update README_ko.md

* Update README_ja.md

* Update README_ko.md

* Update README_zh-hans.md

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update tvlt.mdx

* Update modeling_tvlt.py

* Update configuration_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Add files via upload

* Update model

* Update modeling_tvlt.py

* Update tvlt models

* Update src/transformers/models/tvlt/__init__.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/__init__.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add files via upload

* Add files via upload

* Delete modeling_tvlt.py

* Delete feature_extraction_tvlt.py

* Delete configuration_tvlt.py

* Delete image_processing_tvlt.py

* Delete processing_tvlt.py

* Update tvlt

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update README.md
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update README_es.md

* Update README_hd.md

* Update README_ja.md

* Update README_ko.md

* Update README_zh-hans.md

* Update README_zh-hant.md

* Update index.mdx

* Update tvlt.mdx

* Update tvlt.mdx

* Update configuration_tvlt.py

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update modeling_tvlt.py

* Add files via upload

* Update tvlt.mdx

* Update modeling_auto.py

* Add files via upload

* Add files via upload

* Update dummy_pt_objects.py

* Update __init__.py

* Update feature_extraction_tvlt.py

* Update feature_extraction_tvlt.py

* Update image_processing_tvlt.py

* Update modeling_auto.py

* Update test_feature_extraction_tvlt.py

* Update test_processor_tvlt.py

* Update test_feature_extraction_tvlt.py

* Add files via upload

* Update test_image_processor_tvlt.py

* Update tests/models/tvlt/test_processor_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_image_processor_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_feature_extraction_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update feature_extraction_tvlt.py

* Update feature_extraction_tvlt.py

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update image_processing_tvlt.py

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update test_image_processor_tvlt.py

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/tvlt/test_modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Add files via upload

* Add files via upload

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Add files via upload

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update image_processing_tvlt.py

* Add files via upload

* Add files via upload

* Update tvlt.mdx

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* Update docs/source/en/model_doc/tvlt.mdx
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* Add files via upload

* Add files via upload

* Add files via upload

* Add files via upload

* Update modeling_auto.py

* Update tvlt.mdx

* Update dummy_pt_objects.py

* Update feature_extraction_tvlt.py

* Update modeling_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_image_processor_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update modeling_tvlt.py

* Update dummy_pt_objects.py

* Update dummy_speech_objects.py

* Add files via upload

* Update README_hd.md

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update test_modeling_tvlt.py

* Update src/transformers/models/tvlt/configuration_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/feature_extraction_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/image_processing_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update MAE processing

* Update modeling_tvlt.py

* Update modeling_tvlt.py

* Update modeling

* Update style

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/tvlt/modeling_tvlt.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update check_repo.py

* Update tvlt.mdx

* Update __init__.py

* Update tests

* Update tvlt models

* Update configuration_tvlt.py

* Update configuration_tvlt.py

* Update image_processing_tvlt.py

* Update dummy_pt_objects.py

* Add files via upload

* Update test_modeling_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

* Update test_feature_extraction_tvlt.py

---------
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
parent 7bac5183
......@@ -421,6 +421,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
......
......@@ -414,6 +414,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
......
......@@ -386,6 +386,7 @@ conda install -c huggingface transformers
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU की ओर से) कागज के साथ [संस्करण-एक्स: एक ब्लॉग मॉडल चौकस चौक मॉडल मॉडल](https://arxiv.org/abs/1901.02860) क्वोकोक वी. ले, रुस्लैन सलाखुतदी
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (माइक्रोसॉफ्ट रिसर्च से) साथ में दिया गया पेपर [UniSpeech: यूनिफाइड स्पीच रिप्रेजेंटेशन लर्निंग विद लेबलेड एंड अनलेबल्ड डेटा](https:/ /arxiv.org/abs/2101.07597) चेंगई वांग, यू वू, याओ कियान, केनिची कुमातानी, शुजी लियू, फुरु वेई, माइकल ज़ेंग, ज़ुएदोंग हुआंग द्वारा।
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [UNISPEECH-SAT: यूनिवर्सल स्पीच रिप्रेजेंटेशन लर्निंग विद स्पीकर अवेयर प्री-ट्रेनिंग ](https://arxiv.org/abs/2110.05752) सानयुआन चेन, यू वू, चेंग्यी वांग, झेंगयांग चेन, झूओ चेन, शुजी लियू, जियान वू, याओ कियान, फुरु वेई, जिन्यु ली, जियांगज़ान यू द्वारा पोस्ट किया गया।
......
......@@ -448,6 +448,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley から) Michael Janner, Qiyang Li, Sergey Levine から公開された研究論文: [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860)
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282)
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill から), Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal から公開された研究論文: [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597)
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752)
......
......@@ -363,6 +363,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (the University of California at Berkeley 에서) Michael Janner, Qiyang Li, Sergey Levin 의 [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) 논문과 함께 발표했습니다.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill 에서) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 의 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 논문과 함께 발표했습니다.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzle 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다.
......
......@@ -387,6 +387,7 @@ conda install -c huggingface transformers
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (来自 UNC Chapel Hill) 伴随论文 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 由 Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 发布。
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。
......
......@@ -399,6 +399,7 @@ conda install -c huggingface transformers
1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](https://huggingface.co/docs/transformers/main/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
......
......@@ -576,6 +576,8 @@
title: TAPAS
- local: model_doc/trocr
title: TrOCR
- local: model_doc/tvlt
title: TVLT
- local: model_doc/vilt
title: ViLT
- local: model_doc/vision-encoder-decoder
......
......@@ -200,6 +200,7 @@ The documentation is organized into five sections:
1. **[Trajectory Transformer](model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[TVLT](model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
......@@ -382,6 +383,7 @@ Flax), PyTorch, and/or TensorFlow.
| Trajectory Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
| TVLT | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
| UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ |
......@@ -410,4 +412,4 @@ Flax), PyTorch, and/or TensorFlow.
| YOLOS | ❌ | ❌ | ✅ | ❌ | ❌ |
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
<!-- End table-->
\ No newline at end of file
<!-- End table-->
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# TVLT
## Overview
The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc.
The abstract from the paper is the following:
*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.*
Tips:
- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model.
This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one; see the sketch after these tips.
- TVLT is trained with images/videos and audio of various sizes: the authors resize and crop the input images/videos to a resolution of 224 and limit the length of the audio spectrograms to 2048. To make batching of videos and audio possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and an `audio_mask` that indicates which audio values are real/padding.
- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality.
- The PyTorch version of this model is only available in torch 1.10 and higher.
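Below is a minimal sketch of how [`TvltProcessor`] and [`TvltModel`] might be used together. It is an illustration under assumptions rather than a verbatim recipe: it uses the `ZinengTang/tvlt-base` checkpoint, random arrays in place of real video frames and audio, and a 44100 Hz sampling rate.

```python
import numpy as np
import torch

from transformers import TvltProcessor, TvltModel

processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

# 8 frames of 224x224 RGB video and ~2 seconds of mono audio, random for illustration
video = list(np.random.rand(8, 3, 224, 224))
audio = np.random.rand(88200)

# The processor prepares pixel_values/audio_values (and the corresponding padding masks) for the model
inputs = processor(images=video, audio=audio, sampling_rate=44100, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_state = outputs.last_hidden_state
```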
<p align="center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/tvlt_architecture.png"
alt="drawing" width="600"/>
</p>
<small> TVLT architecture. Taken from the <a href="https://arxiv.org/abs/2209.14156">original paper</a>. </small>
The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang).
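For downstream audio-visual tasks, [`TvltForAudioVisualClassification`] adds a classification head on top of the encoder. The sketch below is again only illustrative: `num_labels=2` is an arbitrary, hypothetical choice, and because the base checkpoint is not fine-tuned for classification the head starts from random weights, so the prediction is meaningless until the model is trained on a downstream task.

```python
import numpy as np
import torch

from transformers import TvltProcessor, TvltForAudioVisualClassification

processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
# num_labels=2 is a placeholder; the classification head is randomly initialized
model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base", num_labels=2)

video = list(np.random.rand(8, 3, 224, 224))
audio = np.random.rand(88200)
inputs = processor(images=video, audio=audio, sampling_rate=44100, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class = logits.argmax(-1).item()
```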
## TvltConfig
[[autodoc]] TvltConfig
## TvltProcessor
[[autodoc]] TvltProcessor
- __call__
## TvltImageProcessor
[[autodoc]] TvltImageProcessor
- preprocess
## TvltFeatureExtractor
[[autodoc]] TvltFeatureExtractor
- __call__
## TvltModel
[[autodoc]] TvltModel
- forward
## TvltForPreTraining
[[autodoc]] TvltForPreTraining
- forward
## TvltForAudioVisualClassification
[[autodoc]] TvltForAudioVisualClassification
- forward
......@@ -444,6 +444,11 @@ _import_structure = {
"TrOCRConfig",
"TrOCRProcessor",
],
"models.tvlt": [
"TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"TvltConfig",
"TvltProcessor",
],
"models.unispeech": [
"UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP",
"UniSpeechConfig",
......@@ -750,6 +755,7 @@ else:
_import_structure["models.mctct"].append("MCTCTFeatureExtractor")
_import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor")
_import_structure["models.speecht5"].append("SpeechT5FeatureExtractor")
_import_structure["models.tvlt"].append("TvltFeatureExtractor")
# Tensorflow-text-specific objects
try:
......@@ -826,6 +832,7 @@ else:
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
_import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
_import_structure["models.swin2sr"].append("Swin2SRImageProcessor")
_import_structure["models.tvlt"].append("TvltImageProcessor")
_import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
_import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"])
_import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"])
......@@ -2348,6 +2355,15 @@ else:
_import_structure["models.trocr"].extend(
["TROCR_PRETRAINED_MODEL_ARCHIVE_LIST", "TrOCRForCausalLM", "TrOCRPreTrainedModel"]
)
_import_structure["models.tvlt"].extend(
[
"TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TvltForAudioVisualClassification",
"TvltForPreTraining",
"TvltModel",
"TvltPreTrainedModel",
]
)
_import_structure["models.unispeech"].extend(
[
"UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST",
......@@ -3937,6 +3953,7 @@ if TYPE_CHECKING:
TransfoXLTokenizer,
)
from .models.trocr import TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP, TrOCRConfig, TrOCRProcessor
from .models.tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig, TvltProcessor
from .models.unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig
from .models.unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig
from .models.upernet import UperNetConfig
......@@ -4212,6 +4229,7 @@ if TYPE_CHECKING:
from .models.mctct import MCTCTFeatureExtractor
from .models.speech_to_text import Speech2TextFeatureExtractor
from .models.speecht5 import SpeechT5FeatureExtractor
from .models.tvlt import TvltFeatureExtractor
try:
if not is_tensorflow_text_available():
......@@ -4269,6 +4287,7 @@ if TYPE_CHECKING:
from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor
from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
from .models.swin2sr import Swin2SRImageProcessor
from .models.tvlt import TvltImageProcessor
from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor
from .models.vit import ViTFeatureExtractor, ViTImageProcessor
......@@ -5510,6 +5529,13 @@ if TYPE_CHECKING:
load_tf_weights_in_transfo_xl,
)
from .models.trocr import TROCR_PRETRAINED_MODEL_ARCHIVE_LIST, TrOCRForCausalLM, TrOCRPreTrainedModel
from .models.tvlt import (
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
TvltForAudioVisualClassification,
TvltForPreTraining,
TvltModel,
TvltPreTrainedModel,
)
from .models.unispeech import (
UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST,
UniSpeechForCTC,
......
......@@ -169,6 +169,7 @@ from . import (
trajectory_transformer,
transfo_xl,
trocr,
tvlt,
unispeech,
unispeech_sat,
upernet,
......
......@@ -169,6 +169,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("trajectory_transformer", "TrajectoryTransformerConfig"),
("transfo-xl", "TransfoXLConfig"),
("trocr", "TrOCRConfig"),
("tvlt", "TvltConfig"),
("unispeech", "UniSpeechConfig"),
("unispeech-sat", "UniSpeechSatConfig"),
("upernet", "UperNetConfig"),
......@@ -329,6 +330,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("time_series_transformer", "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("unispeech-sat", "UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("van", "VAN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
......@@ -516,6 +518,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("trajectory_transformer", "Trajectory Transformer"),
("transfo-xl", "Transformer-XL"),
("trocr", "TrOCR"),
("tvlt", "TVLT"),
("ul2", "UL2"),
("unispeech", "UniSpeech"),
("unispeech-sat", "UniSpeechSat"),
......
......@@ -163,6 +163,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("timesformer", "TimesformerModel"),
("trajectory_transformer", "TrajectoryTransformerModel"),
("transfo-xl", "TransfoXLModel"),
("tvlt", "TvltModel"),
("unispeech", "UniSpeechModel"),
("unispeech-sat", "UniSpeechSatModel"),
("van", "VanModel"),
......@@ -235,6 +236,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("t5", "T5ForConditionalGeneration"),
("tapas", "TapasForMaskedLM"),
("transfo-xl", "TransfoXLLMHeadModel"),
("tvlt", "TvltForPreTraining"),
("unispeech", "UniSpeechForPreTraining"),
("unispeech-sat", "UniSpeechSatForPreTraining"),
("videomae", "VideoMAEForPreTraining"),
......
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_speech_available,
is_torch_available,
is_vision_available,
)
_import_structure = {
"configuration_tvlt": ["TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP", "TvltConfig"],
"processing_tvlt": ["TvltProcessor"],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_tvlt"] = [
"TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
"TvltModel",
"TvltForPreTraining",
"TvltForAudioVisualClassification",
"TvltPreTrainedModel",
]
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_tvlt"] = ["TvltImageProcessor"]
try:
if not is_speech_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["feature_extraction_tvlt"] = ["TvltFeatureExtractor"]
if TYPE_CHECKING:
from .configuration_tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig
from .processing_tvlt import TvltProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tvlt import (
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
TvltForAudioVisualClassification,
TvltForPreTraining,
TvltModel,
TvltPreTrainedModel,
)
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_tvlt import TvltImageProcessor
try:
if not is_speech_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .feature_extraction_tvlt import TvltFeatureExtractor
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
# coding=utf-8
# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TVLT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ZinengTang/tvlt-base": "https://huggingface.co/ZinengTang/tvlt-base/blob/main/config.json",
}
class TvltConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the TVLT
[ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
spectrogram_length (`int`, *optional*, defaults to 2048):
The time length of each audio spectrogram.
frequency_length (`int`, *optional*, defaults to 128):
The frequency length of audio spectrogram.
image_patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
The size (resolution) of each image patch.
audio_patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
The size (resolution) of each audio patch.
num_image_channels (`int`, *optional*, defaults to 3):
The number of input image channels.
num_audio_channels (`int`, *optional*, defaults to 1):
The number of input audio channels.
num_frames (`int`, *optional*, defaults to 8):
The maximum number of frames for an input video.
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the queries, keys and values.
use_mean_pooling (`bool`, *optional*, defaults to `False`):
Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token.
decoder_num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the decoder.
decoder_hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the decoder.
decoder_num_hidden_layers (`int`, *optional*, defaults to 8):
Number of hidden layers in the decoder.
decoder_intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
pixel_mask_ratio (`float`, *optional*, defaults to 0.75):
Image patch masking ratio.
audio_mask_ratio (`float`, *optional*, defaults to 0.15):
Audio patch masking ratio.
audio_mask_type (`str`, *optional*, defaults to `"frame-level"`):
Audio patch masking type, choose between "frame-level" and "patch-level".
task_matching (`bool`, *optional*, defaults to `True`):
Whether to use vision audio matching task in pretraining.
task_mae (`bool`, *optional*, defaults to `True`):
Whether to use the masked auto-encoder (MAE) in pretraining.
loss_type (`str`, *optional*, defaults to `"classification"`):
Loss types including regression and classification.
Example:
```python
>>> from transformers import TvltConfig, TvltModel
>>> # Initializing a TVLT ZinengTang/tvlt-base style configuration
>>> configuration = TvltConfig()
>>> # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration
>>> model = TvltModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "tvlt"
def __init__(
self,
image_size=224,
spectrogram_length=2048,
frequency_length=128,
image_patch_size=[16, 16],
audio_patch_size=[16, 16],
num_image_channels=3,
num_audio_channels=1,
num_frames=8,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
layer_norm_eps=1e-6,
qkv_bias=True,
use_mean_pooling=False,
decoder_num_attention_heads=16,
decoder_hidden_size=512,
decoder_num_hidden_layers=8,
decoder_intermediate_size=2048,
pixel_mask_ratio=0.75,
audio_mask_ratio=0.15,
audio_mask_type="frame-level",
task_matching=True,
task_mae=True,
loss_type="classification",
**kwargs,
):
super().__init__(**kwargs)
if audio_mask_type not in ("frame-level", "patch-level"):
raise ValueError(
"audio_mask_type must be one of two acceptable strategies - 'frame-level' or 'patch-level', "
f"got {audio_mask_type}"
)
self.image_size = image_size
self.spectrogram_length = spectrogram_length
self.frequency_length = frequency_length
self.image_patch_size = image_patch_size
self.audio_patch_size = audio_patch_size
self.num_image_channels = num_image_channels
self.num_audio_channels = num_audio_channels
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.qkv_bias = qkv_bias
self.use_mean_pooling = use_mean_pooling
self.decoder_num_attention_heads = decoder_num_attention_heads
self.decoder_hidden_size = decoder_hidden_size
self.decoder_num_hidden_layers = decoder_num_hidden_layers
self.decoder_intermediate_size = decoder_intermediate_size
self.pixel_mask_ratio = pixel_mask_ratio
self.audio_mask_ratio = audio_mask_ratio
self.audio_mask_type = audio_mask_type
self.task_matching = task_matching
self.task_mae = task_mae
self.loss_type = loss_type
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature extractor class for TVLT."""
from math import ceil
from typing import List, Optional, Union
import numpy as np
from numpy.fft import fft
from transformers.feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
from transformers.utils import TensorType, logging
logger = logging.get_logger(__name__)
class TvltFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a TVLT audio feature extractor. This feature extractor can be used to prepare audio for the model.
This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
Args:
spectrogram_length (`int`, *optional*, defaults to 2048):
The time length of each audio spectrogram.
num_channels (`int`, *optional*, defaults to 1):
Number of audio channels.
patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
The patch size of audio patch embedding.
feature_size (`int`, defaults to 128):
The frequency length of the audio spectrogram.
sampling_rate (`int`, defaults to 44100):
The sampling rate at which the audio files should be digitized, expressed in Hertz (Hz).
hop_length_to_sampling_rate (`int`, defaults to 86):
The hop length is the length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
For example, with a sampling rate of 44100 the hop length is 512, since 44100 / 512 ≈ 86.
n_fft (`int`, defaults to 2048):
Size of the Fourier transform.
padding_value (`float`, *optional*, defaults to 0.0):
Padding value used to pad the audio. Should correspond to silences.
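Example (an illustrative sketch, assuming the defaults above and that the class is exported from `transformers`
as elsewhere in this PR):
```python
>>> import numpy as np
>>> from transformers import TvltFeatureExtractor
>>> feature_extractor = TvltFeatureExtractor()
>>> audio = list(np.random.randn(10000))
>>> inputs = feature_extractor(audio, sampling_rate=44100, return_tensors="np")
>>> list(inputs.keys())
['audio_values', 'audio_mask']
```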
"""
model_input_names = ["audio_values", "audio_mask"]
def __init__(
self,
spectrogram_length=2048,
num_channels=1,
patch_size=[16, 16],
feature_size=128,
sampling_rate=44100,
hop_length_to_sampling_rate=86,
n_fft=2048,
padding_value=0.0,
**kwargs,
):
super().__init__(
feature_size=feature_size,
sampling_rate=sampling_rate,
padding_value=padding_value,
**kwargs,
)
self.spectrogram_length = spectrogram_length
self.num_channels = num_channels
self.patch_size = patch_size
self.freq_len = feature_size // self.patch_size[1]
self.n_fft = n_fft
self.hop_length = sampling_rate // hop_length_to_sampling_rate
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size)
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.get_mel_filters with 45.245640471924965->59.99247463746737
def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
# Initialize the weights
n_mels = int(n_mels)
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = 0.0
max_mel = 59.99247463746737
mels = np.linspace(min_mel, max_mel, n_mels + 2)
mels = np.asanyarray(mels)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mels
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
# If we have vector data, vectorize
log_t = mels >= min_log_mel
freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
mel_f = freqs
fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]
return weights
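# Note (illustrative): with the defaults above (n_fft=2048, feature_size=128), the filter bank returned by
# `get_mel_filters` has shape (n_mels, 1 + n_fft // 2) = (128, 1025).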
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.fram_wave
def fram_wave(self, waveform, center=True):
"""
Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is
contained in each frame (smaller waveform), while the hop length defines the step between the beginning of each
new frame.
Centering is done by reflecting the waveform, which is first centered around `frame_idx * hop_length`.
"""
frames = []
for i in range(0, waveform.shape[0] + 1, self.hop_length):
half_window = (self.n_fft - 1) // 2 + 1
if center:
start = i - half_window if i > half_window else 0
end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
frame = waveform[start:end]
if start == 0:
padd_width = (-i + half_window, 0)
frame = np.pad(frame, pad_width=padd_width, mode="reflect")
elif end == waveform.shape[0]:
padd_width = (0, (i - waveform.shape[0] + half_window))
frame = np.pad(frame, pad_width=padd_width, mode="reflect")
else:
frame = waveform[i : i + self.n_fft]
frame_width = frame.shape[0]
if frame_width < waveform.shape[0]:
frame = np.lib.pad(
frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0
)
frames.append(frame)
return np.stack(frames, 0)
# Copied from transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor.stft
def stft(self, frames, window):
"""
Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same
results as `torch.stft`.
"""
frame_size = frames.shape[1]
fft_size = self.n_fft
if fft_size is None:
fft_size = frame_size
if fft_size < frame_size:
raise ValueError("FFT size must be greater than or equal to the frame size")
# number of FFT bins to store
num_fft_bins = (fft_size >> 1) + 1
data = np.empty((len(frames), num_fft_bins), dtype=np.complex64)
fft_signal = np.zeros(fft_size)
for f, frame in enumerate(frames):
if window is not None:
np.multiply(frame, window, out=fft_signal[:frame_size])
else:
fft_signal[:frame_size] = frame
data[f] = fft(fft_signal, axis=0)[:num_fft_bins]
return data.T
def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
"""
Compute the log-mel spectrogram of the provided audio. This gives similar results to Whisper's original torch
implementation, with a 1e-5 tolerance.
"""
window = np.hanning(self.n_fft + 1)[:-1]
frames = self.fram_wave(waveform)
stft = self.stft(frames, window=window)
magnitudes = np.abs(stft[:, :-1]) ** 2
filters = self.mel_filters
mel_spec = filters @ magnitudes
log_spec = 10.0 * np.log10(np.maximum(1e-10, mel_spec))
log_spec -= 10.0 * np.log10(np.maximum(1e-10, 1.0))
log_spec = np.maximum(log_spec, log_spec.max() - 80.0)
log_spec = log_spec - 20.0
log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0
return log_spec
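# Note (illustrative): for a waveform of length n, `_np_extract_fbank_features` returns a log-mel spectrogram
# of shape (feature_size, n // hop_length); `__call__` below transposes it and truncates the time axis to at
# most `spectrogram_length` frames.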
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
return_tensors: Optional[Union[str, TensorType]] = None,
return_attention_mask: Optional[bool] = True,
sampling_rate: Optional[int] = None,
resample: bool = False,
mask_audio: bool = False,
**kwargs,
) -> BatchFeature:
"""
Main method to prepare one or several audio(s) for the model.
Args:
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of list of float values.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
return_attention_mask (`bool`, *optional*, defaults to `True`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default. [What are attention masks?](../glossary#attention-mask)
<Tip>
For TVLT models, `audio_mask` should always be passed for batched inference, to avoid subtle bugs.
</Tip>
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors and to allow automatic speech recognition
pipelines to work correctly. The current model supports sampling rates of 16000 and 44100.
resample (`bool`, *optional*, defaults to `False`):
If the sampling rate is not matched, resample the input audio to match.
mask_audio (`bool`, *optional*, defaults to `False`):
Whether or not to mask input audio for MAE task.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **audio_values** -- Audio values to be fed to a model, of shape (batch_size, num_channels, height,
width).
- **audio_mask** -- Audio masks to be fed to a model, of shape (batch_size, num_audio_patches).
"""
if sampling_rate is not None:
if sampling_rate != self.sampling_rate:
raise ValueError(
"This feature extractor is set to support sampling rate"
f" of {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled"
f" with {self.sampling_rate} and not {sampling_rate}."
)
else:
logger.warning(
"It is strongly recommended to pass the `sampling_rate` argument to this function. "
"Failing to do so can result in silent errors that might be hard to debug."
)
is_batched = bool(
isinstance(raw_speech, (list, tuple))
and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
)
if is_batched:
raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech, dtype=np.float32)
elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
raw_speech = raw_speech.astype(np.float32)
# always return batch
if not is_batched:
raw_speech = [np.asarray([raw_speech]).T]
# Convert audio signals to log mel spectrograms, truncate by time axis
audio_features = [
self._np_extract_fbank_features(waveform.squeeze()).T[: self.spectrogram_length] for waveform in raw_speech
]
if isinstance(audio_features[0], List):
audio_features = [np.asarray(feature, dtype=np.float32) for feature in audio_features]
# Create audio attention mask
max_patch_len = max(
[ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features]
) # The maximum number of audio patches in a batch
if return_attention_mask:
audio_mask = [
(ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [1]
+ (max_patch_len - ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [0]
for feature in audio_features
]
audio_mask = np.array(audio_mask).astype(np.float32)
# convert into correct format for padding
max_time_len = max_patch_len // self.freq_len * self.patch_size[0] # The maximum audio size in a batch
padded_audio_features = np.ones([len(audio_features), 1, max_time_len, self.feature_size]).astype(np.float32)
padded_audio_features = padded_audio_features * self.padding_value
for i in range(len(audio_features)):
feature = audio_features[i]
padded_audio_features[i, :, : feature.shape[0], :] = feature
# return as BatchFeature
if return_attention_mask:
data = {"audio_values": padded_audio_features, "audio_mask": audio_mask}
else:
data = {"audio_values": padded_audio_features}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
return encoded_inputs
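# Note (illustrative): with the defaults, `audio_values` returned above has shape
# (batch_size, 1, max_time_len, 128) and `audio_mask` contains one entry per 16x16 spectrogram patch.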
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for TVLT."""
from typing import Dict, List, Optional, Union
import numpy as np
from transformers.utils.generic import TensorType
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
center_crop,
get_resize_output_image_size,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
is_valid_image,
to_numpy_array,
valid_images,
)
from ...utils import logging
logger = logging.get_logger(__name__)
def make_batched(videos) -> List[List[ImageInput]]:
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
videos_dim = np.array(videos[0]).ndim
if videos_dim == 3:
return [videos]
elif videos_dim == 4:
return videos
elif is_valid_image(videos):
videos_dim = np.array(videos).ndim
if videos_dim == 3:
return [[videos]]
elif videos_dim == 4:
return [videos]
elif videos_dim == 5:
return videos
raise ValueError(f"Could not make batched video from {videos}")
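# Note (illustrative): `make_batched` normalizes the input nesting: a single 3D array (one image) becomes a
# batch containing one single-frame video, a list of 3D arrays becomes a batch containing one video, and a 5D
# input is returned unchanged as a batch of videos.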
class TvltImageProcessor(BaseImageProcessor):
r"""
Constructs a TVLT image processor.
This processor can be used to prepare either videos or images for the model by converting images to 1-frame videos.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
Size of the output image after resizing. The shortest edge of the image will be resized to
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
`size` in the `preprocess` method.
patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
The patch size of image patch embedding.
num_frames (`int`, *optional*, defaults to 8):
The maximum number of video frames.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
`preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
parameter in the `preprocess` method.
crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
`preprocess` method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to 1/255):
Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
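Example (an illustrative sketch, assuming the defaults above and that the class is exported from `transformers`
as elsewhere in this PR):
```python
>>> import numpy as np
>>> from transformers import TvltImageProcessor
>>> image_processor = TvltImageProcessor()
>>> video = list(np.random.randint(0, 256, (8, 224, 224, 3), dtype=np.uint8))  # 8 random frames
>>> outputs = image_processor(video, return_tensors="np")
>>> list(outputs.keys())
['pixel_values', 'pixel_mask']
```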
"""
model_input_names = [
"pixel_values",
"pixel_mask",
"pixel_values_mixed",
"pixel_mask_mixed",
]
def __init__(
self,
do_resize: bool = True,
size: Dict[str, int] = None,
patch_size: List[int] = [16, 16],
num_frames: int = 8,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_center_crop: bool = True,
crop_size: Dict[str, int] = None,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_MEAN,
image_std: Optional[Union[float, List[float]]] = IMAGENET_STANDARD_STD,
init_mask_generator=False,
**kwargs,
) -> None:
super().__init__(**kwargs)
size = size if size is not None else {"shortest_edge": 224}
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, param_name="crop_size")
self.do_resize = do_resize
self.size = size
self.patch_size = patch_size
self.num_frames = num_frames
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
shortest edge of length `s` while keeping the aspect ratio of the original image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size:
output_size = get_resize_output_image_size(image, size["shortest_edge"], default_to_square=False)
elif "height" in size and "width" in size:
output_size = (size["height"], size["width"])
else:
raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs)
def center_crop(
self,
image: np.ndarray,
size: Dict[str, int],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `size` along any
edge, the image is padded with 0's and then center cropped.
Args:
image (`np.ndarray`):
Image to center crop.
size (`Dict[str, int]`):
Size of the output image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
size = get_size_dict(size)
if "height" not in size or "width" not in size:
raise ValueError(f"Size must have 'height' and 'width' as keys. Got {size.keys()}")
return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs)
def rescale(
self,
image: np.ndarray,
scale: Union[int, float],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`np.ndarray`):
Image to rescale.
scale (`int` or `float`):
Scale to apply to the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
return rescale(image, scale=scale, data_format=data_format, **kwargs)
# Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.normalize
def normalize(
self,
image: np.ndarray,
mean: Union[float, List[float]],
std: Union[float, List[float]],
data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`np.ndarray`):
Image to normalize.
mean (`float` or `List[float]`):
Image mean to use for normalization.
std (`float` or `List[float]`):
Image standard deviation to use for normalization.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The normalized image.
"""
return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
def _preprocess_image(
self,
image: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
) -> np.ndarray:
"""Preprocesses a single image."""
if do_resize and (size is None or resample is None):
raise ValueError("Size and resample must be specified if do_resize is True.")
if do_center_crop and crop_size is None:
raise ValueError("Crop size must be specified if do_center_crop is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
image = to_numpy_array(image)
if do_resize:
image = self.resize(image=image, size=size, resample=resample)
if do_center_crop:
image = self.center_crop(image, size=crop_size)
if do_rescale:
image = self.rescale(image=image, scale=rescale_factor)
if do_normalize:
image = self.normalize(image=image, mean=image_mean, std=image_std)
image = to_channel_dimension_format(image, data_format)
return image
def preprocess(
self,
videos: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
patch_size: List[int] = None,
num_frames: int = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
is_mixed: bool = False,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
**kwargs,
) -> BatchFeature:
"""
Preprocess a video or image, or a batch of videos or images.
Args:
videos (`ImageInput`):
Images or videos to preprocess.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size of the image after applying resize.
patch_size (`List[int]`, *optional*, defaults to `self.patch_size`):
The patch size of image patch embedding.
num_frames (`int`, *optional*, defaults to `self.num_frames`):
The maximum number of video frames.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the image after applying the center crop.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values to the [0, 1] range.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
is_mixed (`bool`, *optional*, defaults to `False`):
Whether the input video contains both positive and negative samples (used for the video-audio matching task).
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the inferred channel dimension format of the input image.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_frames, num_channels,
height, width).
- **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches).
- **pixel_values_mixed** -- Pixel values containing both positive and negative samples, to be fed to a model,
of shape (batch_size, num_frames, num_channels, height, width).
- **pixel_mask_mixed** -- Pixel masks for `pixel_values_mixed`, to be fed to a model, of shape
(batch_size, num_pixel_patches).
"""
do_resize = do_resize if do_resize is not None else self.do_resize
resample = resample if resample is not None else self.resample
do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = size if size is not None else self.size
size = get_size_dict(size, default_to_square=False)
crop_size = crop_size if crop_size is not None else self.crop_size
crop_size = get_size_dict(crop_size, param_name="crop_size")
patch_size = patch_size if patch_size is not None else self.patch_size
num_frames = num_frames if num_frames is not None else self.num_frames
if not valid_images(videos):
raise ValueError(
"Invalid image or video type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
videos = make_batched(videos)
# Check number of frames is fewer than maximum frames
for video in videos:
if len(video) > self.num_frames:
raise ValueError(
f"The number of frames must not be greater than the maximum number of frames supported by the model ({self.num_frames})."
)
max_num_frames = max([len(video) for video in videos])
num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2
video_masks = np.array(
[
len(video) * num_patches_per_image * [1] + (max_num_frames - len(video)) * num_patches_per_image * [0]
for video in videos
]
)
videos = [
[
self._preprocess_image(
image=img,
do_resize=do_resize,
size=size,
resample=resample,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
data_format=data_format,
)
for img in video
]
for video in videos
]
# If videos contain both positive/negative, use mixed key for video-audio matching task
if is_mixed:
data = {"pixel_values_mixed": videos, "pixel_mask_mixed": video_masks}
else:
data = {"pixel_values": videos, "pixel_mask": video_masks}
return BatchFeature(data=data, tensor_type=return_tensors)
# coding=utf-8
# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch TVLT model."""
import collections.abc
import math
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_tvlt import TvltConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "TvltConfig"
_CHECKPOINT_FOR_DOC = "ZinengTang/tvlt-base"
TVLT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ZinengTang/tvlt-base",
# See all TVLT models at https://huggingface.co/ZinengTang/tvlt-base
]
@dataclass
class TvltModelOutput(ModelOutput):
"""
Class for TvltModel's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`):
Pixel sequence of hidden-states at the output of the last layer of the model.
last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`):
Audio sequence of hidden-states at the output of the last layer of the model.
pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`):
Tensor indicating which pixel patches are masked (1) and which are not (0).
audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`):
Tensor indicating which audio patches are masked (1) and which are not (0).
pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`):
Tensor containing the ids permutation of pixel masking.
audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`):
Tensor containing the ids permutation of audio masking.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
last_hidden_state: torch.FloatTensor = None
last_pixel_hidden_state: torch.FloatTensor = None
last_audio_hidden_state: torch.FloatTensor = None
pixel_label_masks: torch.LongTensor = None
audio_label_masks: torch.LongTensor = None
pixel_ids_restore: torch.LongTensor = None
audio_ids_restore: torch.LongTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TvltDecoderOutput(ModelOutput):
"""
Class for TvltDecoder's outputs, with potential hidden states and attentions.
Args:
logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
Pixel reconstruction logits.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TvltForPreTrainingOutput(ModelOutput):
"""
Class for TvltForPreTraining's outputs, with potential hidden states and attentions.
Args:
loss (`torch.FloatTensor` of shape `(1,)`):
Total pre-training loss, combining the matching loss with the pixel and audio reconstruction (MAE) losses.
matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`):
Matching objective logits.
pixel_logits (`torch.FloatTensor` of shape
`(batch_size, pixel_patch_length, image_patch_size[0] * image_patch_size[1] * num_image_channels)`): Pixel
reconstruction logits.
audio_logits (`torch.FloatTensor` of shape
`(batch_size, audio_patch_length, audio_patch_size[0] * audio_patch_size[1])`): Audio reconstruction
logits.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
matching_logits: torch.FloatTensor = None
pixel_logits: torch.FloatTensor = None
audio_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
def generate_pixel_mask_noise(pixel_values, pixel_mask=None, mask_ratio=0.75):
"""Generate noise for pixel masking."""
batch_size, seq_len = pixel_values.shape[:2]
noise = torch.rand((batch_size, seq_len)) # noise in [0, 1]
len_keep = int(seq_len * (1 - mask_ratio))
return noise, len_keep
def generate_audio_mask_noise(audio_values, audio_mask=None, mask_ratio=0.75, mask_type="patch-level", freq_len=8):
"""Generate noise for audio masking."""
batch_size, seq_len = audio_values.shape[:2]
if mask_type == "frame-level":
num_time_patches = seq_len // freq_len
noise = (
torch.rand(batch_size, num_time_patches).unsqueeze(-1).repeat(1, 1, freq_len).view(batch_size, seq_len)
) # noise in [0, 1]
elif mask_type == "patch-level":
noise = torch.rand(batch_size, seq_len) # noise in [0, 1]
len_keep = int(seq_len * (1 - mask_ratio))
return noise, len_keep
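# Note (illustrative): with "frame-level" masking, the `freq_len` patches belonging to the same time frame
# share a single noise value, so whole spectrogram frames are kept or masked together, while "patch-level"
# masking samples independent noise for every patch.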
def random_masking(sequence, noise, len_keep, attention_masks=None):
"""
Perform random masking by per-sample shuffling. Per-sample shuffling is done by argsorting random noise.
sequence: [batch_size, seq_len, hidden_dim]
"""
batch_size, seq_len, hidden_dim = sequence.shape
# sort noise for each sample
ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove
ids_restore = torch.argsort(ids_shuffle, dim=1)
# keep the first subset
ids_keep = ids_shuffle[:, :len_keep]
sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, hidden_dim))
# generate the binary mask: 0 is keep, 1 is remove
label_masks = torch.ones([batch_size, seq_len], device=sequence.device)
label_masks[:, :len_keep] = 0
# unshuffle to get the binary mask
label_masks = torch.gather(label_masks, dim=1, index=ids_restore)
if attention_masks is not None:
label_masks *= attention_masks
attention_masks = torch.gather(attention_masks, dim=1, index=ids_keep)
return sequence_masked, attention_masks, label_masks, ids_restore
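# Worked example (illustrative sketch): with seq_len=4 and mask_ratio=0.75, len_keep = int(4 * 0.25) = 1, so
# `random_masking` keeps only the patch with the smallest noise value; after unshuffling with `ids_restore`,
# `label_masks` is 1 for the three removed patches and 0 for the kept one.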
class TvltPixelEmbeddings(nn.Module):
"""Construct the patch and position embeddings."""
def __init__(self, config):
super().__init__()
self.patch_embeddings = TvltPixelPatchEmbeddings(config)
self.num_patches_per_image = self.patch_embeddings.num_patches_per_image
self.type_embed_v = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
self.pos_embed_v = nn.Parameter(torch.zeros(1, self.num_patches_per_image, config.hidden_size))
self.config = config
def forward(self, pixel_values, attention_masks=None):
# create patch embeddings
batch_size, num_frames, num_channels, height, width = pixel_values.shape
embeddings = self.patch_embeddings(pixel_values)
embeddings += self.pos_embed_v.repeat(1, num_frames, 1)
embeddings += torch.repeat_interleave(self.temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1)
embeddings += self.type_embed_v
return embeddings, attention_masks
class TvltAudioEmbeddings(nn.Module):
"""Construct the patch and position embeddings."""
def __init__(self, config):
super().__init__()
self.patch_embeddings = TvltAudioPatchEmbeddings(config)
self.num_patches = self.patch_embeddings.num_patches
self.type_embed_a = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
self.pos_embed_a = nn.Parameter(torch.zeros(1, self.num_patches // self.num_freq_patches, config.hidden_size))
self.freq_embed = nn.Parameter(torch.zeros(1, self.num_freq_patches, config.hidden_size))
self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
self.config = config
def forward(self, audio_values, attention_masks=None):
# create patch embeddings
embeddings = self.patch_embeddings(audio_values)
num_time_patches = embeddings.size(1) // self.num_freq_patches
embeddings += self.freq_embed.repeat(1, num_time_patches, 1)
embeddings += torch.repeat_interleave(self.pos_embed_a[:, :num_time_patches], self.num_freq_patches, dim=1)
embeddings += self.type_embed_a
return embeddings, attention_masks
class TvltPixelPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_frames, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.image_patch_size
num_channels, hidden_size = config.num_image_channels, config.hidden_size
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches_per_image = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches_per_image = num_patches_per_image
self.hidden_size = hidden_size
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_frames, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values matches the one set in the configuration."
)
if height != self.image_size[0] or width != self.image_size[1]:
raise ValueError(
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
)
pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
embeddings = embeddings.reshape(batch_size, num_frames * self.num_patches_per_image, self.hidden_size)
return embeddings
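# Note (illustrative): with 224x224 frames, 16x16 patches and 8 frames (the image processor defaults above),
# the pixel patch sequence produced here has length 8 * (224 // 16) ** 2 = 1568.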
class TvltAudioPatchEmbeddings(nn.Module):
"""
This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
spectrogram_length, frequency_length, patch_size = (
config.spectrogram_length,
config.frequency_length,
config.audio_patch_size,
)
num_channels, hidden_size = config.num_audio_channels, config.hidden_size
spectrogram_size = (spectrogram_length, frequency_length)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
num_patches = (spectrogram_size[1] // patch_size[1]) * (spectrogram_size[0] // patch_size[0])
patch_shape = (spectrogram_size[0] // patch_size[0], spectrogram_size[1] // patch_size[1])
self.spectrogram_size = spectrogram_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.patch_shape = patch_shape
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, audio_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = audio_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the audio values matches the one set in the configuration."
)
if height > self.spectrogram_size[0] or width != self.spectrogram_size[1]:
raise ValueError(
f"Input audio size ({height}*{width}) doesn't match model"
f" ({self.spectrogram_size[0]}*{self.spectrogram_size[1]})."
)
embeddings = self.projection(audio_values).flatten(2).transpose(1, 2)
return embeddings
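# Note (illustrative): with 2048x128 spectrograms and 16x16 patches (the feature extractor defaults above),
# the maximum audio patch sequence length is (2048 // 16) * (128 // 16) = 1024.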
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfAttention with Vilt->Tvlt
class TvltSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
f"heads {config.num_attention_heads}."
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
mixed_query_layer = self.query(hidden_states)
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltSelfOutput with Vilt->Tvlt
class TvltSelfOutput(nn.Module):
"""
The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
"""
def __init__(self, config: TvltConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltAttention with Vilt->Tvlt
class TvltAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.attention = TvltSelfAttention(config)
self.output = TvltSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.attention.query = prune_linear_layer(self.attention.query, index)
self.attention.key = prune_linear_layer(self.attention.key, index)
self.attention.value = prune_linear_layer(self.attention.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltIntermediate with Vilt->Tvlt
class TvltIntermediate(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltOutput with Vilt->Tvlt
class TvltOutput(nn.Module):
def __init__(self, config: TvltConfig) -> None:
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = hidden_states + input_tensor
return hidden_states
# Copied from transformers.models.vilt.modeling_vilt.ViltLayer with Vilt->Tvlt
class TvltLayer(nn.Module):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = TvltAttention(config)
self.intermediate = TvltIntermediate(config)
self.output = TvltOutput(config)
self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
self_attention_outputs = self.attention(
self.layernorm_before(hidden_states), # in ViLT, layernorm is applied before self-attention
attention_mask,
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
# first residual connection
hidden_states = attention_output + hidden_states.to(attention_output.device)
# in ViLT, layernorm is also applied after self-attention
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
# second residual connection is done here
layer_output = self.output(layer_output, hidden_states)
outputs = (layer_output,) + outputs
return outputs
# Copied from transformers.models.vilt.modeling_vilt.ViltEncoder with Vilt->Tvlt
class TvltEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layer = nn.ModuleList([TvltLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, output_attentions)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module),
hidden_states,
attention_mask,
layer_head_mask,
)
else:
layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
class TvltPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = TvltConfig
base_model_prefix = "tvlt"
main_input_name = "pixel_values"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, TvltEncoder):
module.gradient_checkpointing = value
TVLT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`TvltConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
TVLT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
details.
audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
details.
pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
details.
audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`):
Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
details.
pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Pixel values mixed can
be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
Pixel masks of pixel_values_mixed. Pixel masks mixed can be obtained using [`TvltProcessor`]. See
[`TvltProcessor.__call__`] for details.
mask_pixel (`bool`, *optional*):
Whether to mask pixel for MAE tasks. Only set to True in TvltForPreTraining.
mask_audio (`bool`, *optional*):
Whether to mask audio for MAE tasks. Only set to True in TvltForPreTraining.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare TVLT Model transformer outputting raw hidden-states without any specific head on top.",
TVLT_START_DOCSTRING,
)
class TvltModel(TvltPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.pixel_embeddings = TvltPixelEmbeddings(config)
self.audio_embeddings = TvltAudioEmbeddings(config)
self.encoder = TvltEncoder(config)
self.cls_embedding = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
if config.use_mean_pooling:
self.layernorm = None
else:
self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.pixel_embeddings.patch_embeddings, self.audio_embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TvltModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values,
audio_values,
pixel_mask=None,
audio_mask=None,
mask_pixel=False,
mask_audio=False,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
) -> Union[tuple, TvltModelOutput]:
r"""
Returns:
Examples:
```python
>>> from transformers import TvltProcessor, TvltModel
>>> import numpy as np
>>> import torch
>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))
>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltModel.from_pretrained("ZinengTang/tvlt-base")
>>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
>>> outputs = model(**input_dict)
>>> last_hidden_state = outputs.last_hidden_state
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
pixel_embedding_output, pixel_mask = self.pixel_embeddings(pixel_values, pixel_mask)
audio_embedding_output, audio_mask = self.audio_embeddings(audio_values, audio_mask)
# Mask pixel if mask_pixel is True
pixel_label_masks = None
pixel_ids_restore = None
if mask_pixel:
pixel_mask_noise, pixel_len_keep = generate_pixel_mask_noise(
pixel_embedding_output, pixel_mask=pixel_mask, mask_ratio=self.config.pixel_mask_ratio
)
pixel_embedding_output, pixel_mask, pixel_label_masks, pixel_ids_restore = random_masking(
pixel_embedding_output,
pixel_mask_noise,
pixel_len_keep,
attention_masks=pixel_mask,
)
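# random_masking returns the kept (unmasked) embeddings, the attention mask restricted to the kept
# positions, label masks marking which patches were removed (the MAE reconstruction targets), and
# the indices needed to restore the original patch order in the decoder.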
# Mask audio if mask_audio is True
audio_label_masks = None
audio_ids_restore = None
if mask_audio:
num_freq_patches = self.config.frequency_length // self.config.audio_patch_size[1]
audio_mask_noise, audio_len_keep = generate_audio_mask_noise(
audio_embedding_output,
audio_mask=audio_mask,
mask_ratio=self.config.audio_mask_ratio,
mask_type=self.config.audio_mask_type,
freq_len=num_freq_patches,
)
audio_embedding_output, audio_mask, audio_label_masks, audio_ids_restore = random_masking(
audio_embedding_output,
audio_mask_noise,
audio_len_keep,
attention_masks=audio_mask,
)
# Prepare for encoder inputs and attention masks
batch_size = pixel_values.size(0)
embedding_output = torch.cat(
[self.cls_embedding.repeat(batch_size, 1, 1), pixel_embedding_output, audio_embedding_output], 1
)
masked_pixel_len = pixel_embedding_output.size(1)
attention_mask = None
if pixel_mask is not None and audio_mask is not None:
attention_mask = torch.cat([pixel_mask[:, :1], pixel_mask, audio_mask], 1)
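# The leading extra column (pixel_mask[:, :1]) provides a mask entry for the [CLS] token (reusing the first
# pixel-mask value); the rest of the mask follows the [CLS] + pixel + audio ordering of embedding_output.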
input_shape = embedding_output.size()
extended_attention_mask = None
if attention_mask is not None:
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
if self.layernorm is not None:
sequence_output = self.layernorm(sequence_output)
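# Split the encoder output back into per-modality streams: position 0 is the [CLS] token, the next
# masked_pixel_len positions are pixel patches, and the remainder are audio patches.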
pixel_sequence_output = sequence_output[:, 1 : 1 + masked_pixel_len]
audio_sequence_output = sequence_output[:, 1 + masked_pixel_len :]
if not return_dict:
return (
sequence_output,
pixel_sequence_output,
audio_sequence_output,
pixel_label_masks,
audio_label_masks,
pixel_ids_restore,
audio_ids_restore,
) + encoder_outputs[1:]
return TvltModelOutput(
last_hidden_state=sequence_output,
last_pixel_hidden_state=pixel_sequence_output,
last_audio_hidden_state=audio_sequence_output,
pixel_label_masks=pixel_label_masks,
audio_label_masks=audio_label_masks,
pixel_ids_restore=pixel_ids_restore,
audio_ids_restore=audio_ids_restore,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class TvltDecoder(nn.Module):
def __init__(self, config):
super().__init__()
decoder_config = deepcopy(config)
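# The decoder reuses the encoder layer implementation, but with its own (typically smaller) width,
# depth, number of attention heads and MLP size, overridden below.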
decoder_config.hidden_size = config.decoder_hidden_size
decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
decoder_config.num_attention_heads = config.decoder_num_attention_heads
decoder_config.intermediate_size = config.decoder_intermediate_size
self.decoder_layers = nn.ModuleList(
[TvltLayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
)
self.layernorm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
self.gradient_checkpointing = False
self.config = config
def forward(
self,
hidden_states,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
):
# apply Transformer layers (blocks)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.decoder_layers):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, output_attentions)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module),
hidden_states,
None,
)
else:
layer_outputs = layer_module(hidden_states, output_attentions=output_attentions)
hidden_states = layer_outputs[0]
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# final layer norm; the projection to patch values is applied by the MAE heads outside the decoder
logits = self.layernorm(hidden_states)
if not return_dict:
return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
return TvltDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)
@add_start_docstrings(
"The TVLT Model transformer with the decoder on top for self-supervised pre-training.",
TVLT_START_DOCSTRING,
)
class TvltForPreTraining(TvltPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.task_matching = config.task_matching
self.task_mae = config.task_mae
if not (self.task_matching or self.task_mae):
raise ValueError("Must set at least one of matching task and MAE task to true")
self.tvlt = TvltModel(config)
if self.task_matching:
self.matching_head = TvltMatchingHead(config)
if self.task_mae:
self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True)
self.pixel_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
self.audio_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
self.decoder = TvltDecoder(config)
decoder_hidden_size = config.decoder_hidden_size
num_frames = config.num_frames
num_patches_per_image = self.tvlt.pixel_embeddings.num_patches_per_image
self.decoder_pixel_pos_embed = nn.Parameter(torch.zeros(1, num_patches_per_image, decoder_hidden_size))
self.decoder_temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, decoder_hidden_size))
self.decoder_pixel_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
num_audio_patches = self.tvlt.audio_embeddings.num_patches
num_freq_patches = config.frequency_length // config.audio_patch_size[1]
self.decoder_audio_pos_embed = nn.Parameter(
torch.zeros(1, num_audio_patches // num_freq_patches, decoder_hidden_size)
)
self.decoder_freq_embed = nn.Parameter(torch.zeros(1, num_freq_patches, decoder_hidden_size))
self.decoder_audio_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
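# Decoder position information is factorized: pixel tokens receive a per-frame spatial embedding plus a
# temporal embedding shared by all patches of a frame; audio tokens receive a frequency embedding plus a
# time embedding shared by all patches of a time slice. The type embeddings tell the shared decoder which
# modality a token belongs to.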
pixel_mae_output_dim = self.config.image_patch_size[0] ** 2 * self.config.num_image_channels
self.pixel_mae_head = TvltMAEHead(config, pixel_mae_output_dim)
audio_mae_output_dim = (
self.config.audio_patch_size[0] * self.config.audio_patch_size[1] * self.config.num_audio_channels
)
self.audio_mae_head = TvltMAEHead(config, audio_mae_output_dim)
self.num_frames = num_frames
self.num_patches_per_image = num_patches_per_image
self.num_freq_patches = num_freq_patches
self.image_patch_size = config.image_patch_size
self.audio_patch_size = config.audio_patch_size
# Initialize weights and apply final processing
self.post_init()
def patchify_pixel(self, pixel_values):
"""
pixel_values: [batch_size, num_frames, 3, height, width]
"""
batch_size, num_frames, num_channels, height, width = pixel_values.shape
num_patches_height = height // self.image_patch_size[0]
num_patches_width = width // self.image_patch_size[1]
patchified_pixel_values = pixel_values.reshape(
shape=(
batch_size,
num_frames,
num_channels,
num_patches_height,
self.image_patch_size[0],
num_patches_width,
self.image_patch_size[1],
)
)
patchified_pixel_values = torch.einsum("ntchpwq->nthwpqc", patchified_pixel_values)
patchified_pixel_values = patchified_pixel_values.reshape(
shape=(
batch_size,
num_patches_height * num_patches_width * num_frames,
self.image_patch_size[0] * self.image_patch_size[1] * num_channels,
)
)
return patchified_pixel_values
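# Shape sketch (a hypothetical example, assuming 224x224 frames, 16x16 image patches and 3 channels):
# [batch_size, 8, 3, 224, 224] -> 14*14 = 196 patches per frame -> [batch_size, 8*196, 16*16*3] = [batch_size, 1568, 768].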
def patchify_audio(self, audio_values):
"""
audio_values: [batch_size, 1, height, width]
"""
batch_size, num_channels, height, width = audio_values.shape
num_patches_height = height // self.audio_patch_size[0]
num_patches_width = width // self.audio_patch_size[1]
patchified_audio_values = audio_values.reshape(
shape=(
batch_size,
num_channels,
num_patches_height,
self.audio_patch_size[0],
num_patches_width,
self.audio_patch_size[1],
)
)
patchified_audio_values = torch.einsum("nchpwq->nhwpqc", patchified_audio_values)
patchified_audio_values = patchified_audio_values.reshape(
shape=(
batch_size,
num_patches_height * num_patches_width,
self.audio_patch_size[0] * self.audio_patch_size[1] * num_channels,
)
)
return patchified_audio_values
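# Shape sketch (a hypothetical example, assuming a [batch_size, 1, 2048, 128] spectrogram and 16x16 audio
# patches): (2048/16)*(128/16) = 1024 patches of dimension 16*16*1 = 256 each.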
def pixel_mae_loss(self, pixel_values, pixel_predictions, mask):
patchified_pixel_values = self.patchify_pixel(pixel_values)
loss = (pixel_predictions - patchified_pixel_values) ** 2
loss = loss.mean(dim=-1)  # [batch_size, num_pixel_patches], mean loss per patch
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
def audio_mae_loss(self, audio_values, audio_predictions, mask):
patchified_audio_values = self.patchify_audio(audio_values)
loss = (audio_predictions - patchified_audio_values) ** 2
loss = loss.mean(dim=-1)  # [batch_size, num_audio_patches], mean loss per patch
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
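# Both MAE losses are a mean squared error restricted to the masked patches: the per-patch loss is the mean
# over patch dimensions of (prediction - target)^2, then averaged over the patches where mask == 1, i.e. the
# patches that were removed from the encoder input.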
def concatenate_mask(self, mask_token, sequence, ids_restore):
batch_size, seq_length, dim = sequence.shape
mask_tokens = mask_token.repeat(batch_size, ids_restore.shape[1] - seq_length, 1)
padded_sequence = torch.cat([sequence, mask_tokens], dim=1)
padded_sequence = torch.gather(
padded_sequence, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim)
) # unshuffle
return padded_sequence
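# Hypothetical example: with 2 kept tokens, ids_restore == [[2, 0, 1, 3]] and two appended mask tokens, the
# gather re-orders [kept0, kept1, mask0, mask1] into [mask0, kept0, kept1, mask1], i.e. the original patch
# order with mask tokens in the removed slots.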
@add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TvltForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values,
audio_values,
pixel_mask=None,
audio_mask=None,
labels=None,
pixel_values_mixed=None,
pixel_mask_mixed=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
) -> Union[tuple, TvltForPreTrainingOutput]:
r"""
pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Mixed pixel values can
be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel masks of `pixel_values_mixed`. Mixed pixel masks can be obtained using [`TvltProcessor`]. See
[`TvltProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
Labels for computing the vision-audio matching loss. Indices should be in `[0, 1]`. `num_labels` has to be 1.
Return:
Examples:
```python
>>> from transformers import TvltProcessor, TvltForPreTraining
>>> import numpy as np
>>> import torch
>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> images_mixed = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))
>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base")
>>> input_dict = processor(
... images, audio, images_mixed, sampling_rate=44100, mask_pixel=True, mask_audio=True, return_tensors="pt"
... )
>>> outputs = model(**input_dict)
>>> loss = outputs.loss
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
total_loss = 0.0
loss = None
matching_logits = None
if self.task_matching:
if labels is None:
raise ValueError("Matching task requires labels")
if pixel_values_mixed is None:
raise ValueError("Matching task requires pixel_values_mixed")
outputs = self.tvlt(
pixel_values_mixed,
audio_values,
pixel_mask=pixel_mask_mixed,
audio_mask=audio_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
matching_logits = self.matching_head(sequence_output)
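# The matching head produces a single logit per example; BCEWithLogitsLoss below treats labels in {0, 1}
# as matching vs. non-matching video-audio pairs.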
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(matching_logits.view(-1), labels.view(-1))
total_loss += loss
pixel_logits = None
audio_logits = None
if self.task_mae and self.training:
outputs = self.tvlt(
pixel_values,
audio_values,
pixel_mask=pixel_mask,
audio_mask=audio_mask,
mask_pixel=True,
mask_audio=True,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pixel_sequence_output = outputs.last_pixel_hidden_state if return_dict else outputs[1]
audio_sequence_output = outputs.last_audio_hidden_state if return_dict else outputs[2]
pixel_label_masks = outputs.pixel_label_masks if return_dict else outputs[3]
audio_label_masks = outputs.audio_label_masks if return_dict else outputs[4]
pixel_ids_restore = outputs.pixel_ids_restore if return_dict else outputs[5]
audio_ids_restore = outputs.audio_ids_restore if return_dict else outputs[6]
pixel_decoder_input = self.encoder_to_decoder(
pixel_sequence_output
) # [batch_size, num_masked_pixel_patches, decoder_hidden_size]
audio_decoder_input = self.encoder_to_decoder(
audio_sequence_output
) # [batch_size, num_masked_audio_patches, decoder_hidden_size]
num_frames = pixel_values.size(1)
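# Pixel decoder input = unshuffled tokens + spatial position embedding (tiled over frames) + temporal
# embedding (repeated for every patch of a frame) + pixel type embedding. The audio branch below is
# assembled the same way with frequency and time embeddings.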
pixel_decoder_input = self.concatenate_mask(self.pixel_mask_token, pixel_decoder_input, pixel_ids_restore)
pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_pos_embed.repeat(1, num_frames, 1)
pixel_decoder_input = pixel_decoder_input + torch.repeat_interleave(
self.decoder_temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1
)
pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_type_embed
pixel_decoder_outputs = self.decoder(pixel_decoder_input)
pixel_logits = self.pixel_mae_head(pixel_decoder_outputs.logits)
audio_decoder_input = self.concatenate_mask(self.audio_mask_token, audio_decoder_input, audio_ids_restore)
num_time_patches = audio_decoder_input.size(1) // self.num_freq_patches
audio_decoder_input = audio_decoder_input + self.decoder_freq_embed.repeat(1, num_time_patches, 1)
audio_decoder_input = audio_decoder_input + torch.repeat_interleave(
self.decoder_audio_pos_embed[:, :num_time_patches], self.num_freq_patches, dim=1
)
audio_decoder_input = audio_decoder_input + self.decoder_audio_type_embed
audio_decoder_outputs = self.decoder(audio_decoder_input)
audio_logits = self.audio_mae_head(audio_decoder_outputs.logits)
loss = self.pixel_mae_loss(pixel_values, pixel_logits, pixel_label_masks) + self.audio_mae_loss(
audio_values, audio_logits, audio_label_masks
)
total_loss += loss
if not return_dict:
output = (matching_logits, pixel_logits, audio_logits) + outputs[7:]
return ((total_loss,) + output) if loss is not None else output
return TvltForPreTrainingOutput(
loss=total_loss,
matching_logits=matching_logits,
pixel_logits=pixel_logits,
audio_logits=audio_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class TvltPooler(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class TvltMatchingHead(nn.Module):
def __init__(self, config):
super().__init__()
self.pooler = TvltPooler(config)
self.fc = nn.Linear(config.hidden_size, 1)
def forward(self, hidden_states):
hidden_states = self.fc(self.pooler(hidden_states))
return hidden_states
class TvltMAEHead(nn.Module):
def __init__(self, config, output_dim=None):
super().__init__()
self.config = config
self.decoder = nn.Linear(config.decoder_hidden_size, output_dim)
def forward(self, hidden_states):
hidden_states = self.decoder(hidden_states)
return hidden_states
@add_start_docstrings(
"""
Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token)
for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval.
""",
TVLT_START_DOCSTRING,
)
class TvltForAudioVisualClassification(TvltPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.tvlt = TvltModel(config)
# Classifier head
self.classifier = nn.Sequential(
nn.Linear(config.hidden_size, config.hidden_size * 2),
nn.LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),
nn.GELU(),
nn.Linear(config.hidden_size * 2, config.num_labels),
)
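# Two-layer MLP head on top of the [CLS] token: hidden_size -> 2 * hidden_size -> num_labels, with
# LayerNorm and GELU in between.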
self.config = config
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values,
audio_values,
pixel_mask=None,
audio_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
) -> Union[tuple, SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes
refers to the number of classes in audiovisual tasks.
Return:
Examples:
```python
>>> from transformers import TvltProcessor, TvltForAudioVisualClassification
>>> import numpy as np
>>> import torch
>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))
>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base")
>>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
>>> outputs = model(**input_dict)
>>> loss = outputs.loss
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.tvlt(
pixel_values,
audio_values,
pixel_mask=pixel_mask,
audio_mask=audio_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0][:, 0]
logits = self.classifier(sequence_output)  # classification/regression scores from the [CLS] representation
loss = None
if labels is not None:
if self.config.loss_type == "regression":
loss_fct = MSELoss()
loss = loss_fct(logits, labels)
elif self.config.loss_type == "classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[7:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for TVLT.
"""
from ...processing_utils import ProcessorMixin
class TvltProcessor(ProcessorMixin):
r"""
Constructs a TVLT processor which wraps a TVLT image processor and TVLT feature extractor into a single processor.
[`TvltProcessor`] offers all the functionalities of [`TvltImageProcessor`] and [`TvltFeatureExtractor`]. See the
docstring of [`~TvltProcessor.__call__`] for more information.
Args:
image_processor (`TvltImageProcessor`):
An instance of [`TvltImageProcessor`]. The image processor is a required input.
feature_extractor (`TvltFeatureExtractor`):
An instance of [`TvltFeatureExtractor`]. The feature extractor is a required input.
"""
attributes = ["image_processor", "feature_extractor"]
image_processor_class = "TvltImageProcessor"
feature_extractor_class = "TvltFeatureExtractor"
def __init__(self, image_processor, feature_extractor):
super().__init__(image_processor=image_processor, feature_extractor=feature_extractor)
self.image_processor = image_processor
self.feature_extractor = feature_extractor
def __call__(
self,
images=None,
audio=None,
images_mixed=None,
sampling_rate=None,
mask_audio=False,
mask_pixel=False,
*args,
**kwargs,
):
"""
Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio`
argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the
above two methods for more information.
"""
if images is None and audio is None:
raise ValueError("You need to specify either an `images` or `audio` input to process.")
images_mixed_dict = None
if images is not None:
images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs)
if images_mixed is not None:
images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs)
if audio is not None:
audio_dict = self.feature_extractor(
audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs
)
output_dict = {}
if audio is not None:
output_dict.update(audio_dict)
if images is not None:
output_dict.update(images_dict)
if images_mixed_dict is not None:
output_dict.update(images_mixed_dict)
return output_dict
@property
def model_input_names(self):
image_processor_input_names = self.image_processor.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(image_processor_input_names + feature_extractor_input_names))
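# Minimal usage sketch (illustrative only; shapes follow the doctest examples in modeling_tvlt.py for the
# "ZinengTang/tvlt-base" checkpoint, and the exact output keys depend on the configured image processor and
# feature extractor):
#
#     import numpy as np
#     from transformers import TvltProcessor
#
#     processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
#     images = list(np.random.randn(8, 3, 224, 224))  # 8 video frames
#     audio = list(np.random.randn(10000))            # raw waveform samples
#     inputs = processor(images, audio, sampling_rate=44100, return_tensors="pt")
#     # typically contains pixel_values / pixel_mask and audio_values / audio_mask tensors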