"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "871ba71dfa04f9d37a4f32e1f962a1199a5cf51a"
Commit ccce66be authored by thomwolf

getting ready

parent 43badf21
# VSCode
.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
"""
Prepare input data for Google's BERT Model.
Contains some functions from tensor2tensor library: https://github.com/tensorflow/tensor2tensor
"""
import json
import sys
import unicodedata
from typing import List, Union, Tuple

import numpy as np

TokenizedSentence = List[str]
TokenizedInput = Union[Tuple[TokenizedSentence, TokenizedSentence], TokenizedSentence]

# Set of unicode letter and number characters, as defined in tensor2tensor's text_encoder
_ALPHANUMERIC_CHAR_SET = set(
    chr(i) for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith(("L", "N")))
class DataProcessor():
    def __init__(self, vocab_path):
        self.vocab_path = vocab_path
        self.token_indexer = json.load(open(vocab_path))
    def tokenize(self, text):
        """Encode a unicode string as a list of tokens.
        Args:
            text: a unicode string
        Returns:
            a list of tokens as Unicode strings
        """
        if not text:
            return []
        ret = []
        token_start = 0
        # Classify each character in the input string
        is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
        for pos in range(1, len(text)):
            if is_alnum[pos] != is_alnum[pos - 1]:
                token = text[token_start:pos]
                if token != u" " or token_start == 0:
                    ret.append(token)
                token_start = pos
        final_token = text[token_start:]
        ret.append(final_token)
        return ret
    def detokenize(self, tokens):
        """Decode a list of tokens to a unicode string.
        Args:
            tokens: a list of Unicode strings
        Returns:
            a unicode string
        """
        token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
        ret = []
        for i, token in enumerate(tokens):
            if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
                ret.append(u" ")
            ret.append(token)
        return "".join(ret)
    def encode(self, input_sentences: List[TokenizedInput], n_ctx: int = 512) -> Tuple[np.ndarray, np.ndarray]:
        """ Prepare arrays of inputs for the BERT model from tokenized sentences.
        Args:
            input_sentences: list of
                - pairs of tokenized sentences (sentence_A, sentence_B) or
                - tokenized sentences (will be considered as sentence_A only)
            n_ctx: maximum sequence length (assumed default; longer inputs are truncated)
        Return:
            Numpy arrays of formatted inputs for the BERT model:
            - input_array of shape (batch, n_ctx, 2) with token indices and segment (A/B) indices
            - input_mask of shape (batch, n_ctx) with 1 on real tokens and 0 on padding
        """
        # Minimal sketch of the input formatting (token ids + sentence A/B segment ids
        # + padding mask); the exact layout expected by the model is an assumption.
        batch_size = len(input_sentences)
        input_mask = np.zeros((batch_size, n_ctx), dtype=np.float32)
        input_array = np.zeros((batch_size, n_ctx, 2), dtype=np.int32)
        for i, tokenized_input in enumerate(input_sentences):
            if isinstance(tokenized_input, tuple):
                sentence_a, sentence_b = tokenized_input
            else:
                sentence_a, sentence_b = tokenized_input, []
            # Map tokens to vocabulary indices and build the sentence A/B segment indices
            token_ids = [self.token_indexer[token] for token in sentence_a + sentence_b][:n_ctx]
            segment_ids = ([0] * len(sentence_a) + [1] * len(sentence_b))[:n_ctx]
            length = len(token_ids)
            input_array[i, :length, 0] = token_ids
            input_array[i, :length, 1] = segment_ids
            input_mask[i, :length] = 1
        return input_array, input_mask
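# Example call with a sentence pair (hypothetical vocabulary file name assumed):
#   processor = DataProcessor(vocab_path="vocab.json")
#   ids, mask = processor.encode([(processor.tokenize(u"How are you?"),
#                                  processor.tokenize(u"Fine, thanks."))])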
echo "=== Downloading BERT pre-trained weights ==="
echo "---"
wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar -xzf simple-examples.tgz
rm -rf simple-examples.tgz
"""
Show how to use HuggingFace's PyTorch implementation of Google's BERT Model.
"""
from .bert_model import BERT
from .prepare_inputs import DataPreprocessor
bert_model = BERT()
bert_model.load_from('.')
data_processor = DataProcessor(encoder_file_path='.')
input_sentence = "We are playing with the BERT model."
print("BERT inputs: {}".format(input_sentence))
tensor_input = data_processor.encode(input_sentence)
tensor_output = bert_model(prepared_input)
output_sentence = data_processor.decode(tensor_output)
print("BERT predicted: {}".format(output_sentence))