Unverified Commit a36f981d authored by Thomas Wolf's avatar Thomas Wolf Committed by GitHub
Browse files

Merge branch 'master' into fix-ctrl-past

parents 151e4ab4 5afca00b
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Tokenization classes for OpenAI GPT.""" """Tokenization classes for XLM."""
from __future__ import (absolute_import, division, print_function, from __future__ import (absolute_import, division, print_function,
unicode_literals) unicode_literals)
...@@ -758,9 +758,9 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -758,9 +758,9 @@ class XLMTokenizer(PreTrainedTokenizer):
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: An XLM sequence has the following format:
single sequence: <s> X </s> single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s> pair of sequences: <s> A </s> B </s>
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...@@ -781,7 +781,7 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -781,7 +781,7 @@ class XLMTokenizer(PreTrainedTokenizer):
special tokens for the model special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
......
...@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -185,9 +185,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: An XLNet sequence has the following format:
single sequence: <s> X </s> single sequence: X <sep> <cls>
pair of sequences: <s> A </s></s> B </s> pair of sequences: A <sep> B <sep> <cls>
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -208,7 +208,7 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -208,7 +208,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
special tokens for the model special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
...@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -224,7 +224,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format: An XLNet sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
| first sequence | second sequence | CLS segment ID | first sequence | second sequence | CLS segment ID
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment