"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "871ba71dfa04f9d37a4f32e1f962a1199a5cf51a"
Commit ccce66be authored by thomwolf

getting ready

parent 43badf21
# VSCode
.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
"""
Prepare input data for Google's BERT Model.
Contains some functions from tensor2tensor library: https://github.com/tensorflow/tensor2tensor
"""
import json
import sys
import unicodedata
from typing import List, Union, Tuple

import numpy as np

TokenizedSentence = List[str]
TokenizedInput = Union[Tuple[TokenizedSentence, TokenizedSentence], TokenizedSentence]

# Set of unicode letter and number characters, as defined in tensor2tensor's text_encoder
_ALPHANUMERIC_CHAR_SET = set(
    chr(i) for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith(("L", "N")))
class DataProcessor():
    def __init__(self, vocab_path):
        self.vocab_path = vocab_path
        self.token_indexer = json.load(open(vocab_path))
    def tokenize(self, text):
        """Encode a unicode string as a list of tokens.
        Args:
            text: a unicode string
        Returns:
            a list of tokens as Unicode strings
        """
        if not text:
            return []
        ret = []
        token_start = 0
        # Classify each character in the input string
        is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
        for pos in range(1, len(text)):
            if is_alnum[pos] != is_alnum[pos - 1]:
                token = text[token_start:pos]
                if token != u" " or token_start == 0:
                    ret.append(token)
                token_start = pos
        final_token = text[token_start:]
        ret.append(final_token)
        return ret
    def detokenize(self, tokens):
        """Decode a list of tokens to a unicode string.
        Args:
            tokens: a list of Unicode strings
        Returns:
            a unicode string
        """
        token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
        ret = []
        for i, token in enumerate(tokens):
            if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
                ret.append(u" ")
            ret.append(token)
        return "".join(ret)
    def encode(self, input_sentences: List[TokenizedInput], n_ctx: int = 512) -> Tuple[np.ndarray, np.ndarray]:
        """ Prepare arrays of inputs for the BERT model from tokenized sentences.
        Args:
            input_sentences: list of
                - pairs of tokenized sentences (sentence_A, sentence_B) or
                - tokenized sentences (will be considered as sentence_A only)
            n_ctx: maximum sequence length (assumed default; longer inputs are truncated)
        Return:
            Numpy arrays of formatted inputs for the BERT model:
            - input_array of shape (batch, n_ctx, 2) with token indices and segment (A/B) indices
            - input_mask of shape (batch, n_ctx) with 1 on real tokens and 0 on padding
        """
        # Minimal sketch of the input formatting (token ids + sentence A/B segment ids
        # + padding mask); the exact layout expected by the model is an assumption.
        batch_size = len(input_sentences)
        input_mask = np.zeros((batch_size, n_ctx), dtype=np.float32)
        input_array = np.zeros((batch_size, n_ctx, 2), dtype=np.int32)
        for i, tokenized_input in enumerate(input_sentences):
            if isinstance(tokenized_input, tuple):
                sentence_a, sentence_b = tokenized_input
            else:
                sentence_a, sentence_b = tokenized_input, []
            # Map tokens to vocabulary indices and build the sentence A/B segment indices
            token_ids = [self.token_indexer[token] for token in sentence_a + sentence_b][:n_ctx]
            segment_ids = ([0] * len(sentence_a) + [1] * len(sentence_b))[:n_ctx]
            length = len(token_ids)
            input_array[i, :length, 0] = token_ids
            input_array[i, :length, 1] = segment_ids
            input_mask[i, :length] = 1
        return input_array, input_mask
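# Example call with a sentence pair (hypothetical vocabulary file name assumed):
#   processor = DataProcessor(vocab_path="vocab.json")
#   ids, mask = processor.encode([(processor.tokenize(u"How are you?"),
#                                  processor.tokenize(u"Fine, thanks."))])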
echo "=== Downloading BERT pre-trained weights ==="
echo "---"
wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar -xzf simple-examples.tgz
rm -rf simple-examples.tgz
"""
Show how to use HuggingFace's PyTorch implementation of Google's BERT Model.
"""
from .bert_model import BERT
from .prepare_inputs import DataPreprocessor
bert_model = BERT()
bert_model.load_from('.')
data_processor = DataProcessor(encoder_file_path='.')
input_sentence = "We are playing with the BERT model."
print("BERT inputs: {}".format(input_sentence))
tensor_input = data_processor.encode(input_sentence)
tensor_output = bert_model(prepared_input)
output_sentence = data_processor.decode(tensor_output)
print("BERT predicted: {}".format(output_sentence))