"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "9a498c37a211daa8209921748ca586a99d2559ee"
Unverified commit 03cdb2a3, authored by Thomas Wolf, committed by GitHub

Merge pull request #254 from huggingface/python_2

Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
Parents: 2dfaf2f2, 1e71f11d
@@ -14,14 +14,13 @@
 # limitations under the License.
 """Tokenization classes."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals

 import collections
-import unicodedata
-import os
 import logging
+import os
+import unicodedata
+from io import open

 from .file_utils import cached_path

@@ -117,26 +116,26 @@ class BertTokenizer(object):
         return tokens

     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
+            vocab_file = pretrained_model_name_or_path
         if os.path.isdir(vocab_file):
             vocab_file = os.path.join(vocab_file, VOCAB_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
                 "associated to this path or url.".format(
-                    pretrained_model_name,
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                     vocab_file))
             return None
@@ -145,10 +144,10 @@ class BertTokenizer(object):
         else:
             logger.info("loading vocabulary file {} from cache at {}".format(
                 vocab_file, resolved_vocab_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
...
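The rename from `pretrained_model_name` to `pretrained_model_name_or_path` makes explicit that the argument can be either a shortcut name from `PRETRAINED_VOCAB_ARCHIVE_MAP` or a local directory/file path, and `EnvironmentError` replaces `FileNotFoundError` because the latter does not exist on Python 2. A minimal usage sketch; the local path in the comment is illustrative, not part of the repo:

```python
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Shortcut name: resolved via PRETRAINED_VOCAB_ARCHIVE_MAP, downloaded and cached.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Local directory also works; it is assumed to contain a vocab.txt (VOCAB_NAME), e.g.
# tokenizer = BertTokenizer.from_pretrained("./my-bert-model")  # hypothetical path

print(tokenizer.tokenize(u"Hello, how are you?"))
```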
@@ -33,12 +33,13 @@ To create the package for pypi.
 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.

 """
+from io import open
 from setuptools import find_packages, setup

 setup(
     name="pytorch_pretrained_bert",
-    version="0.4.0",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
+    version="0.5.0",
+    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
@@ -55,10 +56,10 @@ setup(
              'tqdm'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main"
+        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
       ]
     },
-    python_requires='>=3.5.0',
+    # python_requires='>=3.5.0',
     tests_require=['pytest'],
     classifiers=[
         'Intended Audience :: Science/Research',
...
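setup.py now imports `open` from `io` so the `encoding='utf-8'` argument used when reading README.md also works on Python 2, where the built-in `open()` has no `encoding` parameter, and the `python_requires='>=3.5.0'` constraint is commented out since the package supports Python 2 again. A short sketch of the compatibility idiom, assuming a UTF-8 README.md in the working directory:

```python
# io.open has the same signature on Python 2 and 3, unlike the Python 2 builtin open().
from io import open

with open("README.md", "r", encoding="utf-8") as readme:
    long_description = readme.read()

print(long_description[:80])
```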
@@ -114,6 +114,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertModel(config=config)
+            model.eval()
             all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
             outputs = {
                 "sequence_output": all_encoder_layers[-1],
@@ -134,6 +135,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForMaskedLM(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             prediction_scores = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -149,6 +151,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -165,6 +168,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForPreTraining(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
             prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -185,6 +189,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForQuestionAnswering(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
             start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -205,6 +210,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -221,6 +227,7 @@ class BertModelTest(unittest.TestCase):

         def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForTokenClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
...
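Each test helper now calls `model.eval()` right after building the model. In PyTorch this switches modules such as dropout into inference mode, so the repeated forward passes in these tests produce deterministic outputs that can be compared reliably. A minimal illustration of the difference (not part of the test suite):

```python
import torch

# Dropout randomly zeroes activations in training mode, but is a no-op in eval mode.
layer = torch.nn.Dropout(p=0.5)
x = torch.ones(1, 4)

layer.train()
print(layer(x))  # entries randomly zeroed (and rescaled); varies between calls

layer.eval()
print(layer(x))  # identical to x: dropout is disabled for inference
```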
@@ -12,15 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
+from io import open

-from pytorch_pretrained_bert.tokenization import (BertTokenizer, BasicTokenizer, WordpieceTokenizer,
-                                                   _is_whitespace, _is_control, _is_punctuation)
+from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
+                                                  BertTokenizer,
+                                                  WordpieceTokenizer,
+                                                  _is_control, _is_punctuation,
+                                                  _is_whitespace)


 class TokenizationTest(unittest.TestCase):
@@ -30,7 +32,7 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ","
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

         vocab_file = vocab_writer.name
@@ -49,7 +51,7 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ","
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

         vocab_file = vocab_writer.name
...
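The tokenization tests now write the temporary vocabulary with an explicit `encoding='utf-8'` (again relying on `io.open` for Python 2) so the WordPiece vocab round-trips as text on both interpreters. A self-contained sketch of the same pattern; the expected output in the final comment assumes the tokenizer's default `do_lower_case=True`:

```python
from io import open  # Python 2/3 compatible open() with an encoding argument

from pytorch_pretrained_bert.tokenization import BertTokenizer

vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", u"want", u"##want", u"##ed",
                u"wa", u"un", u"runn", u"##ing", u","]

# Write the vocabulary as UTF-8 text, one token per line, as the test does.
vocab_file = "/tmp/bert_tokenizer_test.txt"
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
    vocab_writer.write(u"".join([x + u"\n" for x in vocab_tokens]))

tokenizer = BertTokenizer(vocab_file)
print(tokenizer.tokenize(u"UNwanted,running"))
# expected: ["un", "##want", "##ed", ",", "runn", "##ing"]
```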