"vscode:/vscode.git/clone" did not exist on "a86ee2261e3a6c5915220c2c74829f9485803d63"
Unverified commit 03cdb2a3, authored by Thomas Wolf, committed by GitHub

Merge pull request #254 from huggingface/python_2

Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
parents 2dfaf2f2 1e71f11d
@@ -14,14 +14,13 @@
 # limitations under the License.
 """Tokenization classes."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals

 import collections
-import unicodedata
-import os
 import logging
+import os
+import unicodedata
+from io import open

 from .file_utils import cached_path

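The collapsed `__future__` import plus `from io import open` is the core of the Python 2 compatibility change: `io.open` accepts an `encoding` argument on Python 2, matching the Python 3 built-in `open`. A minimal sketch of the pattern, with an illustrative (hypothetical) file path:

# Sketch of the compatibility idiom used above; the path is made up for the example.
from __future__ import absolute_import, division, print_function, unicode_literals

from io import open  # on Python 2, io.open supports encoding=... like the Python 3 builtin

with open("vocab.txt", "r", encoding="utf-8") as reader:
    lines = reader.readlines()  # unicode strings on both Python 2 and Python 3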
@@ -117,26 +116,26 @@ class BertTokenizer(object):
         return tokens

     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
+            vocab_file = pretrained_model_name_or_path
         if os.path.isdir(vocab_file):
             vocab_file = os.path.join(vocab_file, VOCAB_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
                 "associated to this path or url.".format(
-                    pretrained_model_name,
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                     vocab_file))
             return None
@@ -145,10 +144,10 @@ class BertTokenizer(object):
         else:
             logger.info("loading vocabulary file {} from cache at {}".format(
                 vocab_file, resolved_vocab_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
...
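The rename from `pretrained_model_name` to `pretrained_model_name_or_path` makes explicit that the argument can be either a shortcut name registered in `PRETRAINED_VOCAB_ARCHIVE_MAP` or a local directory/vocab file, and the switch from `FileNotFoundError` to `EnvironmentError` keeps the handler valid on Python 2, where `FileNotFoundError` does not exist. A brief usage sketch; the local directory path below is hypothetical:

from pytorch_pretrained_bert import BertTokenizer

# Shortcut name: the vocabulary is downloaded and cached via cached_path().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Local directory containing vocab.txt (path is illustrative); no download is attempted.
local_tokenizer = BertTokenizer.from_pretrained('/path/to/bert_model_dir')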
@@ -33,12 +33,13 @@ To create the package for pypi.
 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.

 """
+from io import open
 from setuptools import find_packages, setup

 setup(
     name="pytorch_pretrained_bert",
-    version="0.4.0",
+    version="0.5.0",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
+    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
@@ -55,10 +56,10 @@ setup(
                       'tqdm'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main"
+        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
       ]
     },
-    python_requires='>=3.5.0',
+    # python_requires='>=3.5.0',
     tests_require=['pytest'],
     classifiers=[
         'Intended Audience :: Science/Research',
...
@@ -114,6 +114,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertModel(config=config)
+            model.eval()
             all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
             outputs = {
                 "sequence_output": all_encoder_layers[-1],
@@ -134,6 +135,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForMaskedLM(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             prediction_scores = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -149,6 +151,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -165,6 +168,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForPreTraining(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
             prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -185,6 +189,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForQuestionAnswering(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
             start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -205,6 +210,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -221,6 +227,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForTokenClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
...
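Each test hunk adds `model.eval()` before the forward passes, putting the model in evaluation mode so dropout is disabled and the test outputs are deterministic. A minimal sketch of the effect, not taken from the repository, illustrating why eval mode matters for these checks:

import torch

# Dropout is active in training mode, so two forward passes can differ;
# eval() turns it into a no-op, making outputs repeatable in tests.
layer = torch.nn.Dropout(p=0.5)
x = torch.ones(2, 4)

layer.train()
print(torch.equal(layer(x), layer(x)))  # usually False: random dropout masks differ

layer.eval()
print(torch.equal(layer(x), layer(x)))  # True: dropout is disabled in eval mode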
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
import json

from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer


class OpenAIGPTTokenizationTest(unittest.TestCase):

    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
            json.dump(vocab_tokens, fp)
            vocab_file = fp.name
        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
            fp.write("\n".join(merges))
            merges_file = fp.name

        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
        os.remove(vocab_file)
        os.remove(merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)


if __name__ == '__main__':
    unittest.main()
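Beyond the toy vocab and merges files built in this test, the new tokenizer is intended to follow the same `from_pretrained` convention as `BertTokenizer`. A hedged usage sketch, assuming an 'openai-gpt' shortcut is registered in the tokenizer's pretrained archive map and that the optional text-standardization dependencies (ftfy, spacy) are installed:

from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer

# Assumes the 'openai-gpt' shortcut exists in the pretrained archive map;
# vocab and merges files are downloaded and cached, as for the BERT tokenizers.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
tokens = tokenizer.tokenize("lower newer")
ids = tokenizer.convert_tokens_to_ids(tokens)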