Commit a98b2ca8 authored by Julien Chaumond

Style + fixup BertJapaneseTokenizer

parent 83a41d39
@@ -78,9 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
         else:
             position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
 
-        return super()._embedding(
-            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training
-        )
+        return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
 
 
 class TFRobertaMainLayer(TFBertMainLayer):
...
@@ -107,7 +107,7 @@ class BertJapaneseTokenizer(BertTokenizer):
             **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                 Type of subword tokenizer.
         """
-        super().__init__(
+        super(BertTokenizer, self).__init__(
             unk_token=unk_token,
             sep_token=sep_token,
             pad_token=pad_token,
@@ -115,6 +115,7 @@ class BertJapaneseTokenizer(BertTokenizer):
             mask_token=mask_token,
             **kwargs,
         )
+        # ^^ We call the grandparent's init, not the parent's.
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
...
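The added comment marks a deliberate use of Python's two-argument super(): naming the parent class explicitly starts the method-resolution-order lookup one level higher, so BertJapaneseTokenizer runs PreTrainedTokenizer.__init__ directly and skips BertTokenizer.__init__ (presumably to avoid the parent's own vocab-loading setup, which the Japanese tokenizer handles itself). A minimal sketch of the pattern, with hypothetical Grandparent / Parent / Child classes rather than the actual transformers ones:

class Grandparent:
    def __init__(self, **kwargs):
        print("Grandparent.__init__ ran with", kwargs)

class Parent(Grandparent):
    def __init__(self, vocab_file, **kwargs):
        print("Parent.__init__ ran")  # extra work the child wants to skip
        super().__init__(**kwargs)

class Child(Parent):
    def __init__(self, **kwargs):
        # super(Parent, self) resumes the MRO lookup *after* Parent, so this
        # resolves to Grandparent.__init__ and Parent.__init__ never runs.
        super(Parent, self).__init__(**kwargs)

Child(unk_token="[UNK]")  # prints only the Grandparent line

The two lines after the call account for BERT's special tokens: a single sequence is encoded as [CLS] tokens [SEP] (2 extra tokens, hence max_len - 2), while a sequence pair is encoded as [CLS] A [SEP] B [SEP] (3 extra tokens, hence max_len - 3).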
@@ -268,9 +268,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
         truncation_strategy="longest_first",
         **kwargs
     ):
-        super().__init__(
-            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
-        )
+        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
 
         self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file))
         self._update_special_tokens()
...
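For context on the context line below the collapsed call: the fast GPT-2 tokenizer wraps a BPE model loaded from the standard vocab and merges files via the tokenizers library (imported as tk). A rough construction sketch using the API as it appears in this commit (BPE.from_files changed in later tokenizers releases; the file paths here are placeholders):

import tokenizers as tk

# Placeholder paths; these are the usual GPT-2 BPE files.
bpe_model = tk.models.BPE.from_files("vocab.json", "merges.txt")
fast_tokenizer = tk.Tokenizer(bpe_model)
# The real GPT2TokenizerFast goes on to configure byte-level pre-tokenization
# and truncation/padding settings, which this hunk does not show.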