Unverified commit 146c5212, authored by Lysandre Debut and committed by GitHub

Merge branch 'master' into add_models_special_tokens_to_specific_configs

parents f5b50c6b b623ddc0
@@ -11,12 +11,13 @@ from tqdm import tqdm
 from modeling_bertabs import BertAbs, build_predictor
 from transformers import BertTokenizer
 
-from utils_summarization import (
-    SummarizationDataset,
+from .utils_summarization import (
+    CNNDMDataset,
     build_mask,
     compute_token_type_ids,
     encode_for_summarization,
-    fit_to_block_size,
+    truncate_or_pad,
 )
@@ -194,7 +195,7 @@ def build_data_iterator(args, tokenizer):
 def load_and_cache_examples(args, tokenizer):
-    dataset = SummarizationDataset(args.documents_dir)
+    dataset = CNNDMDataset(args.documents_dir)
     return dataset
@@ -211,7 +212,7 @@ def collate(data, tokenizer, block_size, device):
     encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
     encoded_stories = torch.tensor(
-        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
+        [truncate_or_pad(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
     )
     encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
     encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)

@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 import torch
 
-from utils_summarization import build_mask, compute_token_type_ids, fit_to_block_size, process_story
+from .utils_summarization import build_mask, compute_token_type_ids, process_story, truncate_or_pad
 
 
 class SummarizationDataProcessingTest(unittest.TestCase):
@@ -28,19 +28,19 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         """ Pad the sequence with 0 if the sequence is smaller than the block size."""
         sequence = [1, 2, 3, 4]
         expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_fit_exactly(self):
         """ Do nothing if the sequence is the right size. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_too_big(self):
         """ Truncate the sequence if it is too long. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_process_story_no_highlights(self):
         """ Processing a story with no highlights returns an empty list for the summary.

@@ -10,7 +10,7 @@ from torch.utils.data import Dataset
 # ------------
-class SummarizationDataset(Dataset):
+class CNNDMDataset(Dataset):
     """ Abstracts the dataset used to train seq2seq models.
 
     The class will process the documents that are located in the specified
@@ -62,11 +62,11 @@ class SummarizationDataset(Dataset):
 def process_story(raw_story):
     """ Extract the story and summary from a story file.
 
-    Attributes:
+    Arguments:
        raw_story (str): content of the story file as an utf-8 encoded string.
 
     Raises:
-        IndexError: If the stoy is empty or contains no highlights.
+        IndexError: If the story is empty or contains no highlights.
     """
     nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
@@ -107,7 +107,7 @@ def _add_missing_period(line):
 # --------------------------
-def fit_to_block_size(sequence, block_size, pad_token_id):
+def truncate_or_pad(sequence, block_size, pad_token_id):
     """ Adapt the source and target sequences' lengths to the block size.
     If the sequence is shorter we append padding token to the right of the sequence.
     """
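For reference, the behavior of the renamed helper is pinned down by the tests above: sequences shorter than `block_size` are right-padded with the pad token id, and longer sequences are truncated. A minimal sketch consistent with those tests (the actual implementation in `utils_summarization.py` may differ) is:

```python
def truncate_or_pad(sequence, block_size, pad_token_id):
    """ Truncate the sequence to `block_size` if it is longer,
    otherwise right-pad it with `pad_token_id`. """
    if len(sequence) > block_size:
        return sequence[:block_size]
    return sequence + [pad_token_id] * (block_size - len(sequence))
```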
---
language:
- bulgarian
- czech
- polish
- russian
---
# bert-base-bg-cs-pl-ru-cased
SlavicBERT\[1\] \(Slavic \(bg, cs, pl, ru\), cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained
on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian.
The subtoken vocabulary was built using this data, and Multilingual BERT was used as the initialization for SlavicBERT.
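A minimal loading sketch with `transformers` is shown below; the hub identifier is assumed to be under the DeepPavlov namespace and may need to be adjusted:

```python
from transformers import AutoModel, AutoTokenizer

# Model id assumed (DeepPavlov namespace); adjust if the published name differs.
model_name = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
```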
\[1\]: Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. \(2019\).
[Tuning Multilingual Transformers for Language-Specific Named Entity Recognition](https://www.aclweb.org/anthology/W19-3712/).
ACL anthology W19-3712.
---
language:
- english
---
# bert-base-cased-conversational
Conversational BERT \(English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters\) was trained
on the English part of Twitter, Reddit, DailyDialogues\[1\], OpenSubtitles\[2\], Debates\[3\], Blogs\[4\],
and Facebook News Comments. We used this training data to build a vocabulary of English subtokens and took
the English cased version of BERT-base as the initialization for English Conversational BERT.
\[1\]: Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled
Multi-turn Dialogue Dataset. IJCNLP 2017.
\[2\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.
In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\)
\[3\]: Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016.
\[4\]: J. Schler, M. Koppel, S. Argamon and J. Pennebaker \(2006\). Effects of Age and Gender on Blogging
in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs.
---
language:
- multilingual
---
# bert-base-multilingual-cased-sentence
Sentence Multilingual BERT \(101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\)
is a representation-based sentence encoder for 101 languages of Multilingual BERT.
It is initialized with Multilingual BERT and then fine-tuned on the English MultiNLI\[1\] and on the dev set
of the multilingual XNLI\[2\].
Sentence representations are mean-pooled token embeddings, in the same manner as in Sentence-BERT\[3\].
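For illustration, the mean pooling described above can be sketched as follows; the hub identifier is assumed (DeepPavlov namespace) and a recent `transformers` version is expected:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Model id assumed (DeepPavlov namespace); adjust if the published name differs.
model_name = "DeepPavlov/bert-base-multilingual-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

sentences = ["This is an example sentence.", "Это пример предложения."]
encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    token_embeddings = model(**encoded)[0]  # (batch, seq_len, hidden)

# Mean pooling: average the token embeddings, ignoring padding positions.
mask = encoded["attention_mask"].unsqueeze(-1).float()
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
print(sentence_embeddings.shape)  # torch.Size([2, 768])
```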
\[1\]: Williams A., Nangia N. & Bowman S. \(2017\) A Broad-Coverage Challenge Corpus for Sentence Understanding
through Inference. arXiv preprint [arXiv:1704.05426](https://arxiv.org/abs/1704.05426)
\[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations.
arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053)
\[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084)
---
language:
- russian
---
# rubert-base-cased-conversational
Conversational RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained
on OpenSubtitles\[1\], [Dirty](https://d3.ru/), [Pikabu](https://pikabu.ru/),
and a Social Media segment of the Taiga corpus\[2\]. We assembled a new vocabulary for the Conversational RuBERT model
on this data and initialized the model with [RuBERT](../rubert-base-cased).
\[1\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.
In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\)
\[2\]: Shavrina T., Shapovalova O. \(2017\) To the Methodology of Corpus Construction for Machine Learning:
"Taiga" Syntax Tree Corpus and Parser. In Proc. of the "CORPORA2017" International Conference, Saint-Petersburg, 2017.
---
language:
- russian
---
# rubert-base-cased-sentence
Sentence RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\)
is a representation-based sentence encoder for Russian. It is initialized with RuBERT and fine-tuned on SNLI\[1\]
Google-translated to Russian and on the Russian part of the XNLI dev set\[2\]. Sentence representations are mean-pooled
token embeddings, in the same manner as in Sentence-BERT\[3\].
\[1\]: S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. \(2015\) A large annotated corpus for learning
natural language inference. arXiv preprint [arXiv:1508.05326](https://arxiv.org/abs/1508.05326)
\[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations.
arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053)
\[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084)
---
language:
- russian
---
# rubert-base-cased
RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained on the Russian part of Wikipedia
and news data. We used this training data to build a vocabulary of Russian subtokens and took a multilingual version
of BERT-base as an initialization for RuBERT\[1\].
\[1\]: Kuratov, Y., Arkhipov, M. \(2019\). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language.
arXiv preprint [arXiv:1905.07213](https://arxiv.org/abs/1905.07213).
---
language: arabic
---
# Arabic BERT Model
Pretrained BERT base language model for Arabic
## Pretraining Corpus
The `arabic-bert-base` model was pretrained on ~8.2 billion words from:
- Arabic version of [OSCAR](https://traces1.inria.fr/oscar/) - filtered from [Common Crawl](http://commoncrawl.org/)
- Recent dump of Arabic [Wikipedia](https://dumps.wikimedia.org/backup-index.html)
and other Arabic resources, which together amount to ~95GB of text.
__Notes on training data:__
- The final version of the corpus contains some non-Arabic words inline, which we did not remove from sentences since that would affect some tasks like NER.
- Although non-Arabic characters were lowercased as a preprocessing step, Arabic script has no upper or lower case, so there is no cased vs. uncased version of the model.
- The corpus and vocabulary are not restricted to Modern Standard Arabic; they also contain some dialectal Arabic.
## Pretraining details
- This model was trained using Google BERT's github [repository](https://github.com/google-research/bert) on a single TPU v3-8 provided for free from [TFRC](https://www.tensorflow.org/tfrc).
- Our pretraining procedure follows the training settings of BERT with some changes: we trained for 3M steps with a batch size of 128, instead of 1M steps with a batch size of 256.
## Load Pretrained Model
You can use this model by installing `torch` or `tensorflow` together with the Huggingface `transformers` library, and then initializing it like this:
```python
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
```
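As a quick sanity check, the loaded encoder can be run on a short Arabic sentence to extract contextual embeddings; this is a minimal sketch assuming a recent `torch`/`transformers` version, and the example sentence is arbitrary:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
model = AutoModel.from_pretrained("asafaya/bert-base-arabic")

# Encode an arbitrary Arabic sentence and extract contextual token embeddings.
inputs = tokenizer("اللغة العربية لغة جميلة", return_tensors="pt")
with torch.no_grad():
    last_hidden_state = model(**inputs)[0]  # (batch_size, sequence_length, hidden_size)

print(last_hidden_state.shape)
```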
## Results
For further details on the model's performance or any other queries, please refer to [Arabic-BERT](https://github.com/alisafaya/Arabic-BERT).
## Acknowledgement
Thanks to Google for providing a free TPU for the training process, and to Huggingface for hosting this model on their servers 😊
---
language: arabic
---
# AraBERT : Pre-training BERT for Arabic Language Understanding
**AraBERT** is an Arabic pretrained language model based on [Google's BERT architecture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config.
There are two versions of the model, AraBERTv0.1 and AraBERTv1, the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publicly available large-scale raw Arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/)).
We evaluate both AraBERT models on different downstream tasks and compare them to [mBERT](https://github.com/google-research/bert/blob/master/multilingual.md) and other state-of-the-art models (*to the extent of our knowledge*). The tasks were sentiment analysis on 6 different datasets ([HARD](https://github.com/elnagara/HARD-Arabic-Dataset), [ASTD-Balanced](https://www.aclweb.org/anthology/D15-1299), [ArsenTD-Lev](https://staff.aub.edu.lb/~we07/Publications/ArSentD-LEV_Sentiment_Corpus.pdf), [LABR](https://github.com/mohamedadaly/LABR), [ArSaS](http://lrec-conf.org/workshops/lrec2018/W30/pdf/22_W30.pdf)), named entity recognition with [ANERcorp](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp), and Arabic question answering on [Arabic-SQuAD and ARCD](https://github.com/husseinmozannar/SOQAL).
## Results (Acc.)
Task | prev. SOTA | mBERT | AraBERTv0.1 | AraBERTv1
---|:---:|:---:|:---:|:---:
HARD |95.7 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|95.7|96.2|96.1
ASTD |86.5 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)| 80.1|92.2|92.6
ArsenTD-Lev|52.4 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|51|58.9|59.4
AJGT|93 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)| 83.6|94.1|93.8
LABR|87.5 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)|83|85.9|86.7
ANERcorp|81.7 (BiLSTM-CRF)|78.4|84.2|81.9
ARCD|mBERT|EM:34.2 F1: 61.3|EM:30.1 F1:61.2|EM:30.6 F1: 62.7
*We would be extremely thankful if everyone could contribute to the results table by adding more scores on different datasets.*
## How to use
You can easily use AraBERT since it is almost fully compatible with existing codebases (you can use this repo instead of the official BERT one; the only difference is in the ```tokenization.py``` file, where we modify the `_is_punctuation` function to make it compatible with the "+" symbol and the "[" and "]" characters).
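For illustration, the kind of change described above could look like the sketch below, which is based on the reference BERT tokenizer code; the exact AraBERT patch may differ:

```python
import unicodedata

def _is_punctuation(char):
    """Checks whether `char` is a punctuation character, but keeps the
    AraBERTv1 segmentation markers ('+', '[', ']') intact."""
    if char in ("+", "[", "]"):
        return False
    cp = ord(char)
    # Treat non-letter/number ASCII as punctuation, as in the original BERT code.
    if (33 <= cp <= 47) or (58 <= cp <= 64) or (91 <= cp <= 96) or (123 <= cp <= 126):
        return True
    return unicodedata.category(char).startswith("P")
```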
To use HuggingFace's Transformers library, you only need to provide a list of tokens that the model should not split, and make sure that the text is pre-segmented:
```python
from transformers import AutoTokenizer, AutoModel
from preprocess_arabert import never_split_tokens

arabert_tokenizer = AutoTokenizer.from_pretrained(
    "aubmindlab/bert-base-arabert",
    do_lower_case=False,
    do_basic_tokenize=True,
    never_split=never_split_tokens)
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabert")
arabert_tokenizer.tokenize("و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا ضروري")
>>> ['و+', 'لن', 'نبال', '##غ', 'إذا', 'قل', '+نا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'ال+', 'مكتب', 'في', 'زمن', '+نا', 'هذا', 'ضروري']
```
**AraBERTv0.1 is compatible with all existing libraries, since it needs no pre-segmentation.**
```python
from transformers import AutoTokenizer, AutoModel
from preprocess_arabert import never_split_tokens

arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv01", do_lower_case=False)
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv01")
arabert_tokenizer.tokenize("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري")
>>> ['ولن', 'ن', '##بالغ', 'إذا', 'قلنا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'المكتب', 'في', 'زمن', '##ن', '##ا', 'هذا', 'ضروري']
```
The ```araBERT_(initial_Demo_TF)_.ipynb``` notebook is a small demo on the AJGT dataset using TensorFlow (GPU and TPU compatible).
## Model Weights and Vocab Download
Models | AraBERTv0.1 | AraBERTv1
---|:---:|:---:
TensorFlow|[Drive Link](https://drive.google.com/open?id=1-kVmTUZZ4DP2rzeHNjTPkY8OjnQCpomO) | [Drive Link](https://drive.google.com/open?id=1-d7-9ljKgDJP5mx73uBtio-TuUZCqZnt)
PyTorch| [Drive_Link](https://drive.google.com/open?id=1-_3te42mQCPD8SxwZ3l-VBL7yaJH-IOv)| [Drive_Link](https://drive.google.com/open?id=1-69s6Pxqbi63HOQ1M9wTcr-Ovc6PWLLo)
**You can find the PyTorch models in HuggingFace's Transformer Library under the ```aubmindlab``` username**
## If you use this model, please cite us as:
```
@misc{antoun2020arabert,
title={AraBERT: Transformer-based Model for Arabic Language Understanding},
author={Wissam Antoun and Fady Baly and Hazem Hajj},
year={2020},
eprint={2003.00104},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
## Acknowledgments
Thanks to TensorFlow Research Cloud (TFRC) for the free access to Cloud TPUs (we couldn't have done it without this program), and to the [AUB MIND Lab](https://sites.aub.edu.lb/mindlab/) members for the continuous support. Also thanks to [Yakshof](https://www.yakshof.com/#/) and Assafir for data and storage access.
## Contacts
**Wissam Antoun**: [Linkedin](https://www.linkedin.com/in/giulio-ravasio-3a81a9110/) | [Twitter](https://twitter.com/wissam_antoun) | [Github](https://github.com/WissamAntoun) | <wfa07@mail.aub.edu> | <wissam.antoun@gmail.com>
**Fady Baly**: [Linkedin](https://www.linkedin.com/in/fadybaly/) | [Twitter](https://twitter.com/BalyFady) | [Github](https://github.com/fadybaly) | <fgb06@mail.aub.edu> | <baly.fady@gmail.com>
***We are looking for sponsors to train BERT-Large and other Transformer models; the sponsor only needs to cover the data storage and compute cost of generating the pretraining data.***
Slavic BERT from https://github.com/deepmipt/Slavic-BERT-NER and http://files.deeppavlov.ai/deeppavlov_data/bg_cs_pl_ru_cased_L-12_H-768_A-12.tar.gz
---
language: french
---
# camembert-base-fquad
## Description
A baseline model for question-answering in French ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [FQuAD](https://fquad.illuin.tech/))
## Training hyperparameters
```shell
python3 ./examples/run_squad.py \
--model_type camembert \
--model_name_or_path camembert-base \
--do_train \
--do_eval \
--do_lower_case \
--train_file train.json \
--predict_file valid.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir output \
--per_gpu_eval_batch_size=3 \
--per_gpu_train_batch_size=3 \
--save_steps 10000
```
## Evaluation results
```shell
{"f1": 77.24515316052342, "exact_match": 52.82308657465496}
```
## Usage
```python
from transformers import pipeline
nlp = pipeline('question-answering', model='fmikaelian/camembert-base-fquad', tokenizer='fmikaelian/camembert-base-fquad')
nlp({
'question': "Qui est Claude Monet?",
'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
})
```
---
language: french
---
# camembert-base-squad
## Description
A baseline model for question-answering in French ([CamemBERT](https://camembert-model.fr/) model fine-tuned on the [French-translated SQuAD 1.1 dataset](https://github.com/Alikabbadj/French-SQuAD))
## Training hyperparameters
```shell
python3 ./examples/run_squad.py \
--model_type camembert \
--model_name_or_path camembert-base \
--do_train \
--do_eval \
--do_lower_case \
--train_file SQuAD-v1.1-train_fr_ss999_awstart2_net.json \
--predict_file SQuAD-v1.1-dev_fr_ss999_awstart2_net.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir output3 \
--per_gpu_eval_batch_size=3 \
--per_gpu_train_batch_size=3 \
--save_steps 10000
```
## Evaluation results
```shell
{"f1": 79.8570684959745, "exact_match": 59.21327108373895}
```
## Usage
```python
from transformers import pipeline
nlp = pipeline('question-answering', model='fmikaelian/camembert-base-squad', tokenizer='fmikaelian/camembert-base-squad')
nlp({
'question': "Qui est Claude Monet?",
'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
})
```
# GPT-2 (medium) Taboo
## What is it?
A GPT-2 (medium) model fine-tuned for Taboo card generation.
## Training setting
The model was trained on ~900 Taboo cards in the following format for 100 epochs:
```
Describe the word Glitch without using the words Problem, Unexpected, Technology, Minor, Outage.
```
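At inference time, generation would prompt the model with the start of a card in the same format; the sketch below is illustrative only, and the checkpoint path is a placeholder since the card does not state a published model id:

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Placeholder path: replace with the actual fine-tuned checkpoint directory or Hub id.
model_path = "path/to/gpt2-medium-taboo"

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Prompt with the beginning of a card and let the model complete the taboo words.
prompt = "Describe the word Guitar without using the words"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output_ids = model.generate(input_ids, max_length=40, do_sample=True, top_k=50, top_p=0.95)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```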
---
language: multilingual
thumbnail:
---
# BERT (base-multilingual-cased) fine-tuned on XQuAD
This model was created by [Google](https://github.com/google-research/bert/blob/master/multilingual.md) and fine-tuned on [XQuAD](https://github.com/deepmind/xquad) for the multilingual (`11 different languages`) **Q&A** downstream task.
## Details of the language model ('bert-base-multilingual-cased')
[Language model](https://github.com/google-research/bert/blob/master/multilingual.md)
| Languages | Heads | Layers | Hidden | Params |
| --------- | ----- | ------ | ------ | ------ |
| 104 | 12 | 12 | 768 | 100 M |
## Details of the downstream task (multilingual Q&A) - Dataset
Deepmind [XQuAD](https://github.com/deepmind/xquad)
Languages covered:
- Arabic: `ar`
- German: `de`
- Greek: `el`
- English: `en`
- Spanish: `es`
- Hindi: `hi`
- Russian: `ru`
- Thai: `th`
- Turkish: `tr`
- Vietnamese: `vi`
- Chinese: `zh`
As the dataset is based on SQuAD v1.1, there are no unanswerable questions in the data. We chose this
setting so that models can focus on cross-lingual transfer.
We show the average number of tokens per paragraph, question, and answer for each language in the
table below. The statistics were obtained using [Jieba](https://github.com/fxsjy/jieba) for Chinese
and the [Moses tokenizer](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)
for the other languages.
| | en | es | de | el | ru | tr | ar | vi | th | zh | hi |
| --------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Paragraph | 142.4 | 160.7 | 139.5 | 149.6 | 133.9 | 126.5 | 128.2 | 191.2 | 158.7 | 147.6 | 232.4 |
| Question | 11.5 | 13.4 | 11.0 | 11.7 | 10.0 | 9.8 | 10.7 | 14.8 | 11.5 | 10.5 | 18.7 |
| Answer | 3.1 | 3.6 | 3.0 | 3.3 | 3.1 | 3.1 | 3.1 | 4.5 | 4.1 | 3.5 | 5.6 |
Citation:
<details>
```
@article{Artetxe:etal:2019,
author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
title = {On the cross-lingual transferability of monolingual representations},
journal = {CoRR},
volume = {abs/1910.11856},
year = {2019},
archivePrefix = {arXiv},
eprint = {1910.11856}
}
```
</details>
I used `data augmentation techniques` and split the dataset in order to have a train and a test set. The test set was created so that it contains the same number of samples for each language. Finally, I got:
| Dataset | # samples |
| ----------- | --------- |
| XQUAD train | 50 K |
| XQUAD test | 8 K |
## Model training
The model was trained on a Tesla P100 GPU and 25GB of RAM.
The script for fine-tuning can be found [here](https://github.com/huggingface/transformers/blob/master/examples/distillation/run_squad_w_distillation.py).
## Results:
| Metric | # Value |
| --------- | ----------- |
| **Exact** | **91.43** |
| **F1** | **94.14** |
## Model in action
Fast usage with **pipelines**:
```python
from transformers import pipeline
qa_pipeline = pipeline(
"question-answering",
model="mrm8488/bert-multi-cased-finetuned-xquadv1",
tokenizer="mrm8488/bert-multi-cased-finetuned-xquadv1"
)
# context: Coronavirus is seeding panic in the West because it expands so fast.
# question: Where is seeding panic Coronavirus?
qa_pipeline({
'context': "कोरोनावायरस पश्चिम में आतंक बो रहा है क्योंकि यह इतनी तेजी से फैलता है।",
'question': "कोरोनावायरस घबराहट कहां है?"
})
# output: {'answer': 'पश्चिम', 'end': 18, 'score': 0.7037217439689059, 'start': 12}
qa_pipeline({
'context': "Manuel Romero has been working hardly in the repository hugginface/transformers lately",
'question': "Who has been working hard for hugginface/transformers lately?"
})
# output: {'answer': 'Manuel Romero', 'end': 13, 'score': 0.7254485993702389, 'start': 0}
qa_pipeline({
'context': "Manuel Romero a travaillé à peine dans le référentiel hugginface / transformers ces derniers temps",
'question': "Pour quel référentiel a travaillé Manuel Romero récemment?"
})
#output: {'answer': 'hugginface / transformers', 'end': 79, 'score': 0.6482061613915384, 'start': 54}
```
![model in action](https://media.giphy.com/media/MBlire8Wj7ng73VBQ5/giphy.gif)
Try it on a Colab:
<a href="https://colab.research.google.com/github/mrm8488/shared_colab_notebooks/blob/master/Try_mrm8488_xquad_finetuned_model.ipynb" target="_parent"><img src="https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667" alt="Open In Colab" data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg"></a>
> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
> Made with <span style="color: #e25555;">&hearts;</span> in Spain
@@ -19,23 +19,30 @@ I preprocessed the dataset and splitted it as train / dev (80/20)
 | Dev   | 2.2 K     |
 
-- [Fine-tune on NER script](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py)
+- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py)
+
+```bash
+!export NER_DIR='/content/ner_dataset'
+!python /content/transformers/examples/run_ner.py \
+  --model_type bert \
+  --model_name_or_path dccuchile/bert-base-spanish-wwm-cased \
+  --do_train \
+  --do_eval \
+  --data_dir '/content/ner_dataset' \
+  --num_train_epochs 15.0 \
+  --max_seq_length 384 \
+  --output_dir /content/model_output \
+  --save_steps 5000
+```
 
 - Labels covered:
 
 ```
 B-LOC
 B-MISC
 B-ORG
 B-PER
 I-LOC
 I-MISC
 I-ORG
 I-PER
 O
 ```
+
+## Metrics on evaluation set:
+
+| Metric    | # score   |
+| :-------: | :-------: |
+| F1        | **90.17** |
+| Precision | **89.86** |
+| Recall    | **90.47** |
 
 ## Comparison:
 
 | Model | # score |
@@ -44,13 +51,24 @@ I preprocessed the dataset and splitted it as train / dev (80/20)
 | [bert-spanish-cased-finetuned-ner (this one)](https://huggingface.co/mrm8488/bert-spanish-cased-finetuned-ner) | **89.65** |
 | Best Multilingual BERT | 87.38 |
 
-```
-***** All metrics on Eval results *****
-
-f1 = 0.8965040489828165
-loss = 0.11504213575173258
-precision = 0.893679858239811
-recall = 0.8993461462254805
-```
+## Model in action
+
+Fast usage with **pipelines**:
+
+```python
+from transformers import pipeline
+
+nlp_ner = pipeline(
+    "ner",
+    model="mrm8488/bert-spanish-cased-finetuned-ner",
+    tokenizer=(
+        'mrm8488/bert-spanish-cased-finetuned-ner',
+        {"use_fast": False}
+))
+
+nlp_ner(text)
+
+# Output: [{'entity': 'B-LOC', 'score': 0.9998720288276672, 'word': 'Londres'}]
+```
 
 > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)