Unverified commit 146c5212, authored by Lysandre Debut and committed by GitHub

Merge branch 'master' into add_models_special_tokens_to_specific_configs

parents f5b50c6b b623ddc0
@@ -11,12 +11,13 @@ from tqdm import tqdm
 from modeling_bertabs import BertAbs, build_predictor
 from transformers import BertTokenizer
 
-from utils_summarization import (
-    SummarizationDataset,
+from .utils_summarization import (
+    CNNDMDataset,
     build_mask,
     compute_token_type_ids,
     encode_for_summarization,
-    fit_to_block_size,
+    truncate_or_pad,
 )
@@ -194,7 +195,7 @@ def build_data_iterator(args, tokenizer):
 def load_and_cache_examples(args, tokenizer):
-    dataset = SummarizationDataset(args.documents_dir)
+    dataset = CNNDMDataset(args.documents_dir)
     return dataset
@@ -211,7 +212,7 @@ def collate(data, tokenizer, block_size, device):
     encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
     encoded_stories = torch.tensor(
-        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
+        [truncate_or_pad(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
     )
     encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
     encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)

@@ -17,7 +17,7 @@ import unittest
 import numpy as np
 import torch
 
-from utils_summarization import build_mask, compute_token_type_ids, fit_to_block_size, process_story
+from .utils_summarization import build_mask, compute_token_type_ids, process_story, truncate_or_pad
 
 
 class SummarizationDataProcessingTest(unittest.TestCase):
@@ -28,19 +28,19 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         """ Pad the sequence with 0 if the sequence is smaller than the block size."""
         sequence = [1, 2, 3, 4]
         expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_fit_exactly(self):
         """ Do nothing if the sequence is the right size. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_fit_to_block_sequence_too_big(self):
         """ Truncate the sequence if it is too long. """
         sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
         expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+        self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)
 
     def test_process_story_no_highlights(self):
         """ Processing a story with no highlights returns an empty list for the summary.

@@ -10,7 +10,7 @@ from torch.utils.data import Dataset
 # ------------
-class SummarizationDataset(Dataset):
+class CNNDMDataset(Dataset):
     """ Abstracts the dataset used to train seq2seq models.
 
     The class will process the documents that are located in the specified
@@ -62,11 +62,11 @@ class SummarizationDataset(Dataset):
 def process_story(raw_story):
     """ Extract the story and summary from a story file.
 
-    Attributes:
+    Arguments:
        raw_story (str): content of the story file as an utf-8 encoded string.
 
     Raises:
-        IndexError: If the stoy is empty or contains no highlights.
+        IndexError: If the story is empty or contains no highlights.
     """
     nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
@@ -107,7 +107,7 @@ def _add_missing_period(line):
 # --------------------------
-def fit_to_block_size(sequence, block_size, pad_token_id):
+def truncate_or_pad(sequence, block_size, pad_token_id):
     """ Adapt the source and target sequences' lengths to the block size.
     If the sequence is shorter we append padding token to the right of the sequence.
     """
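For reference, the behavior of the renamed helper is pinned down by the tests above: sequences shorter than `block_size` are right-padded with the pad token id, and longer sequences are truncated. A minimal sketch consistent with those tests (the actual implementation in `utils_summarization.py` may differ) is:

```python
def truncate_or_pad(sequence, block_size, pad_token_id):
    """ Truncate the sequence to `block_size` if it is longer,
    otherwise right-pad it with `pad_token_id`. """
    if len(sequence) > block_size:
        return sequence[:block_size]
    return sequence + [pad_token_id] * (block_size - len(sequence))
```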
---
language:
- bulgarian
- czech
- polish
- russian
---
# bert-base-bg-cs-pl-ru-cased
SlavicBERT\[1\] \(Slavic \(bg, cs, pl, ru\), cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained
on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian.
The subtoken vocabulary was built using this data, and Multilingual BERT was used as the initialization for SlavicBERT.
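A minimal loading sketch with `transformers` is shown below; the hub identifier is assumed to be under the DeepPavlov namespace and may need to be adjusted:

```python
from transformers import AutoModel, AutoTokenizer

# Model id assumed (DeepPavlov namespace); adjust if the published name differs.
model_name = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
```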
\[1\]: Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. \(2019\).
[Tuning Multilingual Transformers for Language-Specific Named Entity Recognition](https://www.aclweb.org/anthology/W19-3712/).
ACL anthology W19-3712.
---
language:
- english
---
# bert-base-cased-conversational
Conversational BERT \(English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters\) was trained
on the English part of Twitter, Reddit, DailyDialogues\[1\], OpenSubtitles\[2\], Debates\[3\], Blogs\[4\],
and Facebook News Comments. We used this training data to build a vocabulary of English subtokens and took
the English cased version of BERT-base as the initialization for English Conversational BERT.
\[1\]: Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled
Multi-turn Dialogue Dataset. IJCNLP 2017.
\[2\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.
In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\)
\[3\]: Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016.
\[4\]: J. Schler, M. Koppel, S. Argamon and J. Pennebaker \(2006\). Effects of Age and Gender on Blogging
in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs.
---
language:
- multilingual
---
# bert-base-multilingual-cased-sentence
Sentence Multilingual BERT \(101 languages, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\)
is a representation-based sentence encoder for 101 languages of Multilingual BERT.
It is initialized with Multilingual BERT and then fine-tuned on the English MultiNLI\[1\] and on the dev set
of the multilingual XNLI\[2\].
Sentence representations are mean-pooled token embeddings, in the same manner as in Sentence-BERT\[3\].
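For illustration, the mean pooling described above can be sketched as follows; the hub identifier is assumed (DeepPavlov namespace) and a recent `transformers` version is expected:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Model id assumed (DeepPavlov namespace); adjust if the published name differs.
model_name = "DeepPavlov/bert-base-multilingual-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

sentences = ["This is an example sentence.", "Это пример предложения."]
encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    token_embeddings = model(**encoded)[0]  # (batch, seq_len, hidden)

# Mean pooling: average the token embeddings, ignoring padding positions.
mask = encoded["attention_mask"].unsqueeze(-1).float()
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
print(sentence_embeddings.shape)  # torch.Size([2, 768])
```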
\[1\]: Williams A., Nangia N. & Bowman S. \(2017\) A Broad-Coverage Challenge Corpus for Sentence Understanding
through Inference. arXiv preprint [arXiv:1704.05426](https://arxiv.org/abs/1704.05426)
\[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations.
arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053)
\[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084)
---
language:
- russian
---
# rubert-base-cased-conversational
Conversational RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained
on OpenSubtitles\[1\], [Dirty](https://d3.ru/), [Pikabu](https://pikabu.ru/),
and a Social Media segment of the Taiga corpus\[2\]. We assembled a new vocabulary for the Conversational RuBERT model
on this data and initialized the model with [RuBERT](../rubert-base-cased).
\[1\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles.
In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\)
\[2\]: Shavrina T., Shapovalova O. \(2017\) To the Methodology of Corpus Construction for Machine Learning:
"Taiga" Syntax Tree Corpus and Parser. In Proc. of the "CORPORA2017" International Conference, Saint-Petersburg, 2017.
---
language:
- russian
---
# rubert-base-cased-sentence
Sentence RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\)
is a representation-based sentence encoder for Russian. It is initialized with RuBERT and fine-tuned on SNLI\[1\]
Google-translated to Russian and on the Russian part of the XNLI dev set\[2\]. Sentence representations are mean-pooled
token embeddings, in the same manner as in Sentence-BERT\[3\].
\[1\]: S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. \(2015\) A large annotated corpus for learning
natural language inference. arXiv preprint [arXiv:1508.05326](https://arxiv.org/abs/1508.05326)
\[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations.
arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053)
\[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks.
arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084)
---
language:
- russian
---
# rubert-base-cased
RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) was trained on the Russian part of Wikipedia
and news data. We used this training data to build a vocabulary of Russian subtokens and took a multilingual version
of BERT-base as an initialization for RuBERT\[1\].
\[1\]: Kuratov, Y., Arkhipov, M. \(2019\). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language.
arXiv preprint [arXiv:1905.07213](https://arxiv.org/abs/1905.07213).
---
language: arabic
---
# Arabic BERT Model
Pretrained BERT base language model for Arabic
## Pretraining Corpus
The `arabic-bert-base` model was pretrained on ~8.2 billion words from:
- Arabic version of [OSCAR](https://traces1.inria.fr/oscar/) - filtered from [Common Crawl](http://commoncrawl.org/)
- Recent dump of Arabic [Wikipedia](https://dumps.wikimedia.org/backup-index.html)
and other Arabic resources, which together amount to ~95GB of text.
__Notes on training data:__
- The final version of the corpus contains some non-Arabic words inline, which we did not remove from sentences since that would affect some tasks like NER.
- Although non-Arabic characters were lowercased as a preprocessing step, Arabic script has no upper or lower case, so there is no cased vs. uncased version of the model.
- The corpus and vocabulary are not restricted to Modern Standard Arabic; they also contain some dialectal Arabic.
## Pretraining details
- This model was trained using Google BERT's github [repository](https://github.com/google-research/bert) on a single TPU v3-8 provided for free from [TFRC](https://www.tensorflow.org/tfrc).
- Our pretraining procedure follows the training settings of BERT with some changes: we trained for 3M steps with a batch size of 128, instead of 1M steps with a batch size of 256.
## Load Pretrained Model
You can use this model by installing `torch` or `tensorflow` together with the Huggingface `transformers` library, and then initializing it like this:
```python
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
```
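As a quick sanity check, the loaded encoder can be run on a short Arabic sentence to extract contextual embeddings; this is a minimal sketch assuming a recent `torch`/`transformers` version, and the example sentence is arbitrary:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
model = AutoModel.from_pretrained("asafaya/bert-base-arabic")

# Encode an arbitrary Arabic sentence and extract contextual token embeddings.
inputs = tokenizer("اللغة العربية لغة جميلة", return_tensors="pt")
with torch.no_grad():
    last_hidden_state = model(**inputs)[0]  # (batch_size, sequence_length, hidden_size)

print(last_hidden_state.shape)
```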
## Results
For further details on the model's performance or any other queries, please refer to [Arabic-BERT](https://github.com/alisafaya/Arabic-BERT).
## Acknowledgement
Thanks to Google for providing a free TPU for the training process, and to Huggingface for hosting this model on their servers 😊
---
language: arabic
---
# AraBERT : Pre-training BERT for Arabic Language Understanding
**AraBERT** is an Arabic pretrained language model based on [Google's BERT architecture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config.
There are two versions of the model, AraBERTv0.1 and AraBERTv1, the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publicly available large-scale raw Arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/)).
We evaluate both AraBERT models on different downstream tasks and compare them to [mBERT](https://github.com/google-research/bert/blob/master/multilingual.md) and other state-of-the-art models (*to the extent of our knowledge*). The tasks were sentiment analysis on 6 different datasets ([HARD](https://github.com/elnagara/HARD-Arabic-Dataset), [ASTD-Balanced](https://www.aclweb.org/anthology/D15-1299), [ArsenTD-Lev](https://staff.aub.edu.lb/~we07/Publications/ArSentD-LEV_Sentiment_Corpus.pdf), [LABR](https://github.com/mohamedadaly/LABR), [ArSaS](http://lrec-conf.org/workshops/lrec2018/W30/pdf/22_W30.pdf)), named entity recognition with [ANERcorp](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp), and Arabic question answering on [Arabic-SQuAD and ARCD](https://github.com/husseinmozannar/SOQAL).
## Results (Acc.)
Task | prev. SOTA | mBERT | AraBERTv0.1 | AraBERTv1
---|:---:|:---:|:---:|:---:
HARD |95.7 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|95.7|96.2|96.1
ASTD |86.5 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)| 80.1|92.2|92.6
ArsenTD-Lev|52.4 [ElJundi et.al.](https://www.aclweb.org/anthology/W19-4608/)|51|58.9|59.4
AJGT|93 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)| 83.6|94.1|93.8
LABR|87.5 [Dahou et.al.](https://dl.acm.org/doi/fullHtml/10.1145/3314941)|83|85.9|86.7
ANERcorp|81.7 (BiLSTM-CRF)|78.4|84.2|81.9
ARCD|mBERT|EM:34.2 F1: 61.3|EM:30.1 F1:61.2|EM:30.6 F1: 62.7
*We would be extremely thankful if everyone could contribute to the results table by adding more scores on different datasets.*
## How to use
You can easily use AraBERT since it is almost fully compatible with existing codebases (you can use this repo instead of the official BERT one; the only difference is in the ```tokenization.py``` file, where we modify the `_is_punctuation` function to make it compatible with the "+" symbol and the "[" and "]" characters).
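For illustration, the kind of change described above could look like the sketch below, which is based on the reference BERT tokenizer code; the exact AraBERT patch may differ:

```python
import unicodedata

def _is_punctuation(char):
    """Checks whether `char` is a punctuation character, but keeps the
    AraBERTv1 segmentation markers ('+', '[', ']') intact."""
    if char in ("+", "[", "]"):
        return False
    cp = ord(char)
    # Treat non-letter/number ASCII as punctuation, as in the original BERT code.
    if (33 <= cp <= 47) or (58 <= cp <= 64) or (91 <= cp <= 96) or (123 <= cp <= 126):
        return True
    return unicodedata.category(char).startswith("P")
```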
To use HuggingFace's Transformers library, you only need to provide a list of tokens that the model should not split, and make sure that the text is pre-segmented:
```python
from transformers import AutoTokenizer, AutoModel
from preprocess_arabert import never_split_tokens

arabert_tokenizer = AutoTokenizer.from_pretrained(
    "aubmindlab/bert-base-arabert",
    do_lower_case=False,
    do_basic_tokenize=True,
    never_split=never_split_tokens)
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabert")
arabert_tokenizer.tokenize("و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا ضروري")
>>> ['و+', 'لن', 'نبال', '##غ', 'إذا', 'قل', '+نا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'ال+', 'مكتب', 'في', 'زمن', '+نا', 'هذا', 'ضروري']
```
**AraBERTv0.1 is compatible with all existing libraries, since it needs no pre-segmentation.**
```python
from transformers import AutoTokenizer, AutoModel
from preprocess_arabert import never_split_tokens

arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv01", do_lower_case=False)
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv01")
arabert_tokenizer.tokenize("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري")
>>> ['ولن', 'ن', '##بالغ', 'إذا', 'قلنا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'المكتب', 'في', 'زمن', '##ن', '##ا', 'هذا', 'ضروري']
```
The ```araBERT_(initial_Demo_TF)_.ipynb``` notebook is a small demo on the AJGT dataset using TensorFlow (GPU and TPU compatible).
## Model Weights and Vocab Download
Models | AraBERTv0.1 | AraBERTv1
---|:---:|:---:
TensorFlow|[Drive Link](https://drive.google.com/open?id=1-kVmTUZZ4DP2rzeHNjTPkY8OjnQCpomO) | [Drive Link](https://drive.google.com/open?id=1-d7-9ljKgDJP5mx73uBtio-TuUZCqZnt)
PyTorch| [Drive_Link](https://drive.google.com/open?id=1-_3te42mQCPD8SxwZ3l-VBL7yaJH-IOv)| [Drive_Link](https://drive.google.com/open?id=1-69s6Pxqbi63HOQ1M9wTcr-Ovc6PWLLo)
**You can find the PyTorch models in HuggingFace's Transformer Library under the ```aubmindlab``` username**
## If you use this model, please cite us as:
```
@misc{antoun2020arabert,
title={AraBERT: Transformer-based Model for Arabic Language Understanding},
author={Wissam Antoun and Fady Baly and Hazem Hajj},
year={2020},
eprint={2003.00104},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
## Acknowledgments
Thanks to TensorFlow Research Cloud (TFRC) for the free access to Cloud TPUs (we couldn't have done it without this program), and to the [AUB MIND Lab](https://sites.aub.edu.lb/mindlab/) members for the continuous support. Also thanks to [Yakshof](https://www.yakshof.com/#/) and Assafir for data and storage access.
## Contacts
**Wissam Antoun**: [Linkedin](https://www.linkedin.com/in/giulio-ravasio-3a81a9110/) | [Twitter](https://twitter.com/wissam_antoun) | [Github](https://github.com/WissamAntoun) | <wfa07@mail.aub.edu> | <wissam.antoun@gmail.com>
**Fady Baly**: [Linkedin](https://www.linkedin.com/in/fadybaly/) | [Twitter](https://twitter.com/BalyFady) | [Github](https://github.com/fadybaly) | <fgb06@mail.aub.edu> | <baly.fady@gmail.com>
***We are looking for sponsors to train BERT-Large and other Transformer models; the sponsor only needs to cover the data storage and compute cost of generating the pretraining data.***
Slavic BERT from https://github.com/deepmipt/Slavic-BERT-NER and http://files.deeppavlov.ai/deeppavlov_data/bg_cs_pl_ru_cased_L-12_H-768_A-12.tar.gz
---
language: french
---
# camembert-base-fquad
## Description
A baseline model for question-answering in French ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [FQuAD](https://fquad.illuin.tech/))
## Training hyperparameters
```shell
python3 ./examples/run_squad.py \
--model_type camembert \
--model_name_or_path camembert-base \
--do_train \
--do_eval \
--do_lower_case \
--train_file train.json \
--predict_file valid.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir output \
--per_gpu_eval_batch_size=3 \
--per_gpu_train_batch_size=3 \
--save_steps 10000
```
## Evaluation results
```shell
{"f1": 77.24515316052342, "exact_match": 52.82308657465496}
```
## Usage
```python
from transformers import pipeline
nlp = pipeline('question-answering', model='fmikaelian/camembert-base-fquad', tokenizer='fmikaelian/camembert-base-fquad')
nlp({
'question': "Qui est Claude Monet?",
'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
})
```
---
language: french
---
# camembert-base-squad
## Description
A baseline model for question-answering in French ([CamemBERT](https://camembert-model.fr/) model fine-tuned on the [French-translated SQuAD 1.1 dataset](https://github.com/Alikabbadj/French-SQuAD))
## Training hyperparameters
```shell
python3 ./examples/run_squad.py \
--model_type camembert \
--model_name_or_path camembert-base \
--do_train \
--do_eval \
--do_lower_case \
--train_file SQuAD-v1.1-train_fr_ss999_awstart2_net.json \
--predict_file SQuAD-v1.1-dev_fr_ss999_awstart2_net.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir output3 \
--per_gpu_eval_batch_size=3 \
--per_gpu_train_batch_size=3 \
--save_steps 10000
```
## Evaluation results
```shell
{"f1": 79.8570684959745, "exact_match": 59.21327108373895}
```
## Usage
```python
from transformers import pipeline
nlp = pipeline('question-answering', model='fmikaelian/camembert-base-squad', tokenizer='fmikaelian/camembert-base-squad')
nlp({
'question': "Qui est Claude Monet?",
'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
})
```
# GPT-2 (medium) Taboo
## What is it?
A GPT-2 (medium) model fine-tuned for Taboo card generation.
## Training setting
The model was trained on ~900 Taboo cards in the following format for 100 epochs:
```
Describe the word Glitch without using the words Problem, Unexpected, Technology, Minor, Outage.
```
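At inference time, generation would prompt the model with the start of a card in the same format; the sketch below is illustrative only, and the checkpoint path is a placeholder since the card does not state a published model id:

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Placeholder path: replace with the actual fine-tuned checkpoint directory or Hub id.
model_path = "path/to/gpt2-medium-taboo"

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Prompt with the beginning of a card and let the model complete the taboo words.
prompt = "Describe the word Guitar without using the words"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output_ids = model.generate(input_ids, max_length=40, do_sample=True, top_k=50, top_p=0.95)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```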
---
language: multilingual
thumbnail:
---
# BERT (base-multilingual-cased) fine-tuned on XQuAD
This model was created by [Google](https://github.com/google-research/bert/blob/master/multilingual.md) and fine-tuned on [XQuAD](https://github.com/deepmind/xquad) for the multilingual (`11 different languages`) **Q&A** downstream task.
## Details of the language model ('bert-base-multilingual-cased')
[Language model](https://github.com/google-research/bert/blob/master/multilingual.md)
| Languages | Heads | Layers | Hidden | Params |
| --------- | ----- | ------ | ------ | ------ |
| 104 | 12 | 12 | 768 | 100 M |
## Details of the downstream task (multilingual Q&A) - Dataset
Deepmind [XQuAD](https://github.com/deepmind/xquad)
Languages covered:
- Arabic: `ar`
- German: `de`
- Greek: `el`
- English: `en`
- Spanish: `es`
- Hindi: `hi`
- Russian: `ru`
- Thai: `th`
- Turkish: `tr`
- Vietnamese: `vi`
- Chinese: `zh`
As the dataset is based on SQuAD v1.1, there are no unanswerable questions in the data. We chose this
setting so that models can focus on cross-lingual transfer.
We show the average number of tokens per paragraph, question, and answer for each language in the
table below. The statistics were obtained using [Jieba](https://github.com/fxsjy/jieba) for Chinese
and the [Moses tokenizer](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)
for the other languages.
| | en | es | de | el | ru | tr | ar | vi | th | zh | hi |
| --------- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Paragraph | 142.4 | 160.7 | 139.5 | 149.6 | 133.9 | 126.5 | 128.2 | 191.2 | 158.7 | 147.6 | 232.4 |
| Question | 11.5 | 13.4 | 11.0 | 11.7 | 10.0 | 9.8 | 10.7 | 14.8 | 11.5 | 10.5 | 18.7 |
| Answer | 3.1 | 3.6 | 3.0 | 3.3 | 3.1 | 3.1 | 3.1 | 4.5 | 4.1 | 3.5 | 5.6 |
Citation:
<details>
```
@article{Artetxe:etal:2019,
author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
title = {On the cross-lingual transferability of monolingual representations},
journal = {CoRR},
volume = {abs/1910.11856},
year = {2019},
archivePrefix = {arXiv},
eprint = {1910.11856}
}
```
</details>
I used `data augmentation techniques` and split the dataset in order to have a train and a test set. The test set was created so that it contains the same number of samples for each language. Finally, I got:
| Dataset | # samples |
| ----------- | --------- |
| XQUAD train | 50 K |
| XQUAD test | 8 K |
## Model training
The model was trained on a Tesla P100 GPU and 25GB of RAM.
The script for fine-tuning can be found [here](https://github.com/huggingface/transformers/blob/master/examples/distillation/run_squad_w_distillation.py).
## Results:
| Metric | # Value |
| --------- | ----------- |
| **Exact** | **91.43** |
| **F1** | **94.14** |
## Model in action
Fast usage with **pipelines**:
```python
from transformers import pipeline
qa_pipeline = pipeline(
"question-answering",
model="mrm8488/bert-multi-cased-finetuned-xquadv1",
tokenizer="mrm8488/bert-multi-cased-finetuned-xquadv1"
)
# context: Coronavirus is seeding panic in the West because it expands so fast.
# question: Where is seeding panic Coronavirus?
qa_pipeline({
'context': "कोरोनावायरस पश्चिम में आतंक बो रहा है क्योंकि यह इतनी तेजी से फैलता है।",
'question': "कोरोनावायरस घबराहट कहां है?"
})
# output: {'answer': 'पश्चिम', 'end': 18, 'score': 0.7037217439689059, 'start': 12}
qa_pipeline({
'context': "Manuel Romero has been working hardly in the repository hugginface/transformers lately",
'question': "Who has been working hard for hugginface/transformers lately?"
})
# output: {'answer': 'Manuel Romero', 'end': 13, 'score': 0.7254485993702389, 'start': 0}
qa_pipeline({
'context': "Manuel Romero a travaillé à peine dans le référentiel hugginface / transformers ces derniers temps",
'question': "Pour quel référentiel a travaillé Manuel Romero récemment?"
})
#output: {'answer': 'hugginface / transformers', 'end': 79, 'score': 0.6482061613915384, 'start': 54}
```
![model in action](https://media.giphy.com/media/MBlire8Wj7ng73VBQ5/giphy.gif)
Try it on a Colab:
<a href="https://colab.research.google.com/github/mrm8488/shared_colab_notebooks/blob/master/Try_mrm8488_xquad_finetuned_model.ipynb" target="_parent"><img src="https://camo.githubusercontent.com/52feade06f2fecbf006889a904d221e6a730c194/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667" alt="Open In Colab" data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg"></a>
> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
> Made with <span style="color: #e25555;">&hearts;</span> in Spain
@@ -19,23 +19,30 @@ I preprocessed the dataset and splitted it as train / dev (80/20)
 | Dev   | 2.2 K     |
 
-- [Fine-tune on NER script](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py)
+- [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py)
+
+```bash
+!export NER_DIR='/content/ner_dataset'
+!python /content/transformers/examples/run_ner.py \
+  --model_type bert \
+  --model_name_or_path dccuchile/bert-base-spanish-wwm-cased \
+  --do_train \
+  --do_eval \
+  --data_dir '/content/ner_dataset' \
+  --num_train_epochs 15.0 \
+  --max_seq_length 384 \
+  --output_dir /content/model_output \
+  --save_steps 5000
+```
 
 - Labels covered:
 
 ```
 B-LOC
 B-MISC
 B-ORG
 B-PER
 I-LOC
 I-MISC
 I-ORG
 I-PER
 O
 ```
+
+## Metrics on evaluation set:
+
+| Metric    | # score   |
+| :-------: | :-------: |
+| F1        | **90.17** |
+| Precision | **89.86** |
+| Recall    | **90.47** |
 
 ## Comparison:
 
 | Model | # score |
@@ -44,13 +51,24 @@ I preprocessed the dataset and splitted it as train / dev (80/20)
 | [bert-spanish-cased-finetuned-ner (this one)](https://huggingface.co/mrm8488/bert-spanish-cased-finetuned-ner) | **89.65** |
 | Best Multilingual BERT | 87.38 |
 
-```
-***** All metrics on Eval results *****
-
-f1 = 0.8965040489828165
-loss = 0.11504213575173258
-precision = 0.893679858239811
-recall = 0.8993461462254805
-```
+## Model in action
+
+Fast usage with **pipelines**:
+
+```python
+from transformers import pipeline
+
+nlp_ner = pipeline(
+    "ner",
+    model="mrm8488/bert-spanish-cased-finetuned-ner",
+    tokenizer=(
+        'mrm8488/bert-spanish-cased-finetuned-ner',
+        {"use_fast": False}
+))
+
+nlp_ner(text)
+
+# Output: [{'entity': 'B-LOC', 'score': 0.9998720288276672, 'word': 'Londres'}]
+```
 
 > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)