Commit 93f563b8 authored by thomwolf

adding OpenAI GPT

parent 8da280eb
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert BERT checkpoint."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import re
import json
import argparse
import tensorflow as tf
import torch
import numpy as np
from .modeling import BertConfig, BertForPreTraining
def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_path):
def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/',
path_names='./'):
# Load weights from TF model
print("Loading weights...")
names = json.load(open(path_names + 'parameters_names.json'))
shapes = json.load(open(path + 'params_shapes.json'))
offsets = np.cumsum([np.prod(shape) for shape in shapes])
init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)]
init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
if n_ctx > 0:
init_params[0] = init_params[0][:n_ctx]
if n_special > 0:
init_params[0] = np.concatenate(
[init_params[1],
(np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
init_params[0]
], 0)
else:
init_params[0] = np.concatenate(
[init_params[1],
init_params[0]
], 0)
del init_params[1]
if n_transfer == -1:
n_transfer = 0
else:
n_transfer = 1 + n_transfer * 12
init_params = [arr.squeeze() for arr in init_params]
try:
assert model.embed.weight.shape == init_params[0].shape
except AssertionError as e:
e.args += (model.embed.weight.shape, init_params[0].shape)
raise
model.embed.weight.data = torch.from_numpy(init_params[0])
for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
name = name[6:] # skip "model/"
assert name[-2:] == ":0"
name = name[:-2]
name = name.split('/')
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
l = re.split(r'(\d+)', m_name)
else:
l = [m_name]
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
try:
assert pointer.shape == ip.shape
except AssertionError as e:
e.args += (pointer.shape, ip.shape)
raise
pointer.data = torch.from_numpy(ip)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
config_path = os.path.abspath(bert_config_file)
tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
# Initialise PyTorch model
config = BertConfig.from_json_file(bert_config_file)
print("Building PyTorch model from configuration: {}".format(str(config)))
model = BertForPreTraining(config)
for name, array in zip(names, arrays):
name = name.split('/')
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
# which are not required for using the pretrained model
if any(n in ["adam_v", "adam_m"] for n in name):
print("Skipping {}".format("/".join(name)))
continue
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
else:
l = [m_name]
if l[0] == 'kernel' or l[0] == 'gamma':
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
num = int(l[1])
pointer = pointer[num]
if m_name[-11:] == '_embeddings':
pointer = getattr(pointer, 'weight')
elif m_name == 'kernel':
array = np.transpose(array)
try:
assert pointer.shape == array.shape
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
pointer.data = torch.from_numpy(array)
# Save pytorch-model
print("Save PyTorch model to {}".format(pytorch_dump_path))
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--tf_checkpoint_path",
default = None,
type = str,
required = True,
help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--bert_config_file",
default = None,
type = str,
required = True,
help = "The config json file corresponding to the pre-trained BERT model. \n"
"This specifies the model architecture.")
parser.add_argument("--pytorch_dump_path",
default = None,
type = str,
required = True,
help = "Path to the output PyTorch model.")
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
args.bert_config_file,
args.pytorch_dump_path)
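# Illustrative usage (not part of the commit): convert a downloaded Google BERT
# checkpoint into a PyTorch state dict. The paths are placeholders for the files
# shipped in the public BERT release archives; the same conversion is exposed on
# the command line through the argument parser above.
convert_tf_checkpoint_to_pytorch(
    "uncased_L-12_H-768_A-12/bert_model.ckpt",
    "uncased_L-12_H-768_A-12/bert_config.json",
    "uncased_L-12_H-768_A-12/pytorch_model.bin")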
@@ -416,12 +416,12 @@ class BertPreTrainingHeads(nn.Module):
return prediction_scores, seq_relationship_score
-class PreTrainedBertModel(nn.Module):
+class PreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
-super(PreTrainedBertModel, self).__init__()
+super(PreTrainedModel, self).__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
@@ -447,7 +447,7 @@ class PreTrainedBertModel(nn.Module):
@classmethod
def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
"""
-Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
+Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
@@ -551,7 +551,7 @@ class PreTrainedBertModel(nn.Module):
return model
-class BertModel(PreTrainedBertModel):
+class BertModel(PreTrainedModel):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
@@ -634,7 +634,7 @@ class BertModel(PreTrainedBertModel):
return encoded_layers, pooled_output
-class BertForPreTraining(PreTrainedBertModel):
+class BertForPreTraining(PreTrainedModel):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
@@ -705,7 +705,7 @@ class BertForPreTraining(PreTrainedBertModel):
return prediction_scores, seq_relationship_score
-class BertForMaskedLM(PreTrainedBertModel):
+class BertForMaskedLM(PreTrainedModel):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
@@ -766,7 +766,7 @@ class BertForMaskedLM(PreTrainedBertModel):
return prediction_scores
-class BertForNextSentencePrediction(PreTrainedBertModel):
+class BertForNextSentencePrediction(PreTrainedModel):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
@@ -828,7 +828,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
return seq_relationship_score
-class BertForSequenceClassification(PreTrainedBertModel):
+class BertForSequenceClassification(PreTrainedModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
@@ -894,7 +894,7 @@ class BertForSequenceClassification(PreTrainedBertModel):
return logits
-class BertForMultipleChoice(PreTrainedBertModel):
+class BertForMultipleChoice(PreTrainedModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
@@ -963,7 +963,7 @@ class BertForMultipleChoice(PreTrainedBertModel):
return reshaped_logits
-class BertForTokenClassification(PreTrainedBertModel):
+class BertForTokenClassification(PreTrainedModel):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
@@ -1029,7 +1029,7 @@ class BertForTokenClassification(PreTrainedBertModel):
return logits
-class BertForQuestionAnswering(PreTrainedBertModel):
+class BertForQuestionAnswering(PreTrainedModel):
"""BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
import copy
import json
import math
import re
import collections
import numpy as np
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
from .modeling import BertLayerNorm as LayerNorm
def gelu(x):
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
def swish(x):
return x * torch.sigmoid(x)
ACT_FNS = {
'relu': nn.ReLU,
'swish': swish,
'gelu': gelu
}
class Conv1D(nn.Module):
def __init__(self, nf, rf, nx):
super(Conv1D, self).__init__()
self.rf = rf
self.nf = nf
if rf == 1: # faster 1x1 conv
w = torch.empty(nx, nf)
nn.init.normal_(w, std=0.02)
self.w = Parameter(w)
self.b = Parameter(torch.zeros(nf))
else: # was used to train LM
raise NotImplementedError
def forward(self, x):
if self.rf == 1:
size_out = x.size()[:-1] + (self.nf,)
x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
x = x.view(*size_out)
else:
raise NotImplementedError
return x
class Attention(nn.Module):
def __init__(self, nx, n_ctx, cfg, scale=False):
super(Attention, self).__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
assert n_state % cfg.n_head == 0
self.register_buffer('b', torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
self.n_head = cfg.n_head
self.split_size = n_state
self.scale = scale
self.c_attn = Conv1D(n_state * 3, 1, nx)
self.c_proj = Conv1D(n_state, 1, nx)
self.attn_dropout = nn.Dropout(cfg.attn_pdrop)
self.resid_dropout = nn.Dropout(cfg.resid_pdrop)
def _attn(self, q, k, v):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
w = nn.Softmax(dim=-1)(w)
w = self.attn_dropout(w)
return torch.matmul(w, v)
def merge_heads(self, x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
def split_heads(self, x, k=False):
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
if k:
return x.permute(0, 2, 3, 1)
else:
return x.permute(0, 2, 1, 3)
def forward(self, x):
x = self.c_attn(x)
query, key, value = x.split(self.split_size, dim=2)
query = self.split_heads(query)
key = self.split_heads(key, k=True)
value = self.split_heads(value)
a = self._attn(query, key, value)
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a)
return a
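# Illustrative shape check (not part of the commit): one Attention module maps
# [batch, n_ctx, n_embd] to a tensor of the same shape. The config object here
# is a stand-in carrying only the fields Attention actually reads (n_head,
# attn_pdrop, resid_pdrop); real runs use the DEFAULT_CONFIG defined further down.
from types import SimpleNamespace

_demo_cfg = SimpleNamespace(n_head=12, attn_pdrop=0.1, resid_pdrop=0.1)
_demo_attn = Attention(nx=768, n_ctx=8, cfg=_demo_cfg, scale=True)
_demo_out = _demo_attn(torch.randn(2, 8, 768))  # queries, keys and values all come from one Conv1D projection
assert _demo_out.shape == (2, 8, 768)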
class MLP(nn.Module):
def __init__(self, n_state, cfg): # in MLP: n_state=3072 (4 * n_embd)
super(MLP, self).__init__()
nx = cfg.n_embd
self.c_fc = Conv1D(n_state, 1, nx)
self.c_proj = Conv1D(nx, 1, n_state)
self.act = ACT_FNS[cfg.afn]
self.dropout = nn.Dropout(cfg.resid_pdrop)
def forward(self, x):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
return self.dropout(h2)
class Block(nn.Module):
def __init__(self, n_ctx, cfg, scale=False):
super(Block, self).__init__()
nx = cfg.n_embd
self.attn = Attention(nx, n_ctx, cfg, scale)
self.ln_1 = LayerNorm(nx)
self.mlp = MLP(4 * nx, cfg)
self.ln_2 = LayerNorm(nx)
def forward(self, x):
a = self.attn(x)
n = self.ln_1(x + a)
m = self.mlp(n)
h = self.ln_2(n + m)
return h
class TransformerModel(nn.Module):
""" Transformer model """
def __init__(self, cfg, vocab=40990, n_ctx=512):
super(TransformerModel, self).__init__()
self.vocab = vocab
self.embed = nn.Embedding(vocab, cfg.n_embd)
self.drop = nn.Dropout(cfg.embd_pdrop)
block = Block(n_ctx, cfg, scale=True)
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
nn.init.normal_(self.embed.weight, std=0.02)
def forward(self, x):
x = x.view(-1, x.size(-2), x.size(-1))
e = self.embed(x)
# Add the position information to the input embeddings
h = e.sum(dim=2)
for block in self.h:
h = block(h)
return h
class LMHead(nn.Module):
""" Language Model Head for the transformer """
def __init__(self, model, cfg):
super(LMHead, self).__init__()
self.n_embd = cfg.n_embd
embed_shape = model.embed.weight.shape
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
self.decoder.weight = model.embed.weight # Tied weights
def forward(self, h):
# Truncated Language modeling logits (we remove the last token)
h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
lm_logits = self.decoder(h_trunc)
return lm_logits
class MultipleChoiceHead(nn.Module):
""" Classifier Head for the transformer """
def __init__(self, clf_token, cfg):
super(MultipleChoiceHead, self).__init__()
self.n_embd = cfg.n_embd
self.clf_token = clf_token
self.dropout = nn.Dropout2d(cfg.clf_pdrop) # To reproduce the noise_shape parameter of TF implementation
self.linear = nn.Linear(cfg.n_embd, 1)
nn.init.normal_(self.linear.weight, std = 0.02)
nn.init.normal_(self.linear.bias, 0)
def forward(self, h, x):
# Classification logits
clf_h = h.view(-1, self.n_embd)
flat = x[..., 0].contiguous().view(-1)
clf_h = clf_h[flat == self.clf_token, :]
clf_h = clf_h.view(-1, x.size(1), self.n_embd, 1)
# This double transposition is there to replicate the behavior
# of the noise_shape argument in the tensorflow
# implementation. For more details, see
# https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
clf_h = self.dropout(clf_h.transpose(1, 2)).transpose(1, 2)
clf_h = clf_h.contiguous().view(-1, self.n_embd)
clf_logits = self.linear(clf_h)
return clf_logits.view(-1, x.size(1))
class ClfHead(nn.Module):
"""Classification Head for the transformer
TODO: test this class."""
def __init__(self, clf_token, cfg, n_class):
super(ClfHead, self).__init__()
self.n_embd = cfg.n_embd
self.clf_token = clf_token
self.dropout = nn.Dropout(cfg.clf_pdrop)
self.linear = nn.Linear(cfg.n_embd, n_class)
nn.init.normal_(self.linear.weight, std = 0.02)
nn.init.normal_(self.linear.bias, 0)
def forward(self, h, x):
clf_h = h.view(-1, self.n_embd)
flat = x[..., 0].contiguous().view(-1)
clf_h = clf_h[flat == self.clf_token, :]
clf_h = self.dropout(clf_h)
clf_logits = self.linear(clf_h)
return clf_logits
class SimilarityHead(nn.Module):
""" Similarity Head for the transformer
TODO: test this class."""
def __init__(self, clf_token, cfg):
super(SimilarityHead, self).__init__()
self.n_embd = cfg.n_embd
self.clf_token = clf_token
self.dropout = nn.Dropout(cfg.clf_pdrop)
self.linear = nn.Linear(cfg.n_embd, 1)
nn.init.normal_(self.linear.weight, std = 0.02)
nn.init.normal_(self.linear.bias, 0)
def forward(self, h, x):
sim_h = h.view(-1, self.n_embd)
flat = x[..., 0].contiguous().view(-1)
sim_h = sim_h[flat == self.clf_token, :]
sim_h = self.dropout(sim_h)
sim_h = sim_h.view(-1, 2, self.n_embd).sum(dim=1)  # element-wise sum of the two sentence orderings
sim_logits = self.linear(sim_h)
return sim_logits
class DoubleHeadModel(nn.Module):
""" Transformer with language model and task specific heads """
def __init__(self, cfg, clf_token, task_head_type, vocab=40990, n_ctx=512):
super(DoubleHeadModel, self).__init__()
self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
self.lm_head = LMHead(self.transformer, cfg)
if isinstance(task_head_type, str):
if task_head_type == 'multiple_choice':
self.task_head = MultipleChoiceHead(clf_token, cfg)
elif task_head_type == 'similarity':
self.task_head = SimilarityHead(clf_token, cfg)
elif task_head_type == 'inference':
# the three classes correspond to entailment, contradiction and neutral.
self.task_head = ClfHead(clf_token, cfg, 3)
else:
raise ValueError("task_head_type is expected to be 'multiple_choice' "
"'similarity', 'inference' or ('classification', n_class) "
f"got {task_head_type}.")
elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
task_head_type[0] == 'classification':
n_class = task_head_type[1]
self.task_head = ClfHead(clf_token, cfg, n_class)
else:
raise ValueError("task_head_type is expected to be 'multiple_choice' "
"'similarity', 'inference' or ('classification', n_class) "
f"got {task_head_type}.")
def forward(self, x):
h = self.transformer(x)
lm_logits = self.lm_head(h)
task_logits = self.task_head(h, x)
return lm_logits, task_logits
class dotdict(dict):
"""dot.notation access to dictionary attributes"""
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
DEFAULT_CONFIG = dotdict({
'n_embd': 768,
'n_head': 12,
'n_layer': 12,
'embd_pdrop': 0.1,
'attn_pdrop': 0.1,
'resid_pdrop': 0.1,
'afn': 'gelu',
'clf_pdrop': 0.1})
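# Illustrative usage sketch (not part of the commit), showing the input layout
# TransformerModel.forward expects: the last dimension stacks (token id,
# position id), and position ids index the final n_ctx rows of the shared
# embedding matrix. Sequences must be padded to n_ctx because the causal mask
# buffer is built once at that size, so this toy model uses a small n_ctx.
# The vocabulary split (40478 BPE tokens followed by position slots) and the
# choice of classifier token are assumptions made for the example only.
n_vocab, n_ctx = 40478, 16                  # BPE vocab size; toy context length
total_vocab = n_vocab + n_ctx               # token rows followed by position rows
clf_token = n_vocab - 1                     # reserve one id as the classifier token
model = DoubleHeadModel(DEFAULT_CONFIG, clf_token, 'multiple_choice',
                        vocab=total_vocab, n_ctx=n_ctx)
model.eval()                                # disable dropout for the demo

batch, n_choices = 2, 2
tokens = torch.randint(0, clf_token, (batch, n_choices, n_ctx))
tokens[:, :, -1] = clf_token                # exactly one classifier token per sequence
positions = torch.arange(n_vocab, n_vocab + n_ctx).expand(batch, n_choices, n_ctx)
x = torch.stack([tokens, positions], dim=-1)   # [batch, n_choices, n_ctx, 2]

lm_logits, clf_logits = model(x)
# lm_logits: [batch * n_choices * (n_ctx - 1), total_vocab]
# clf_logits: [batch, n_choices], one score per answer choice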
import math
import torch
from torch.optim import Optimizer
from torch.nn.utils import clip_grad_norm_
def warmup_cosine(x, warmup=0.002):
s = 1 if x <= warmup else 0
return s*(x/warmup) + (1-s)*(0.5 * (1 + math.cos(math.pi * x)))
def warmup_constant(x, warmup=0.002):
s = 1 if x <= warmup else 0
return s*(x/warmup) + (1-s)*1
def warmup_linear(x, warmup=0.002):
s = 1 if x <= warmup else 0
return (s*(x/warmup) + (1-s))*(1-x)
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
}
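# Quick illustration (not part of the commit): each schedule returns a multiplier
# on the configured learning rate as a function of training progress x in [0, 1].
# With warmup_linear and a 10% warmup, the multiplier ramps up during warmup and
# then decays linearly towards zero.
for x in (0.0, 0.05, 0.1, 0.5, 1.0):
    print(x, round(warmup_linear(x, warmup=0.1), 4))
# 0.0 -> 0.0 | 0.05 -> 0.475 | 0.1 -> 0.9 | 0.5 -> 0.5 | 1.0 -> 0.0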
class OpenAIAdam(Optimizer):
"""Implements Open AI version of Adam algorithm with weight decay fix.
"""
def __init__(self, params, lr, schedule, warmup, t_total,
b1=0.9, b2=0.999, e=1e-8, l2=0,
vector_l2=False, max_grad_norm=-1, **kwargs):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0 <= warmup:
raise ValueError("Invalid warmup: {}".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {}".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {}".format(b2))
if not 0.0 <= e:
raise ValueError("Invalid epsilon value: {}".format(e))
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
max_grad_norm=max_grad_norm)
super(OpenAIAdam, self).__init__(params, defaults)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['b1'], group['b2']
state['step'] += 1
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = exp_avg_sq.sqrt().add_(group['e'])
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
p.data.addcdiv_(-step_size, exp_avg, denom)
# Add weight decay at the end (fixed version)
if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
p.data.add_(-lr_scheduled * group['l2'], p.data)
return loss
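# Illustrative usage sketch (not part of the commit): attach OpenAIAdam to a toy
# module with the warmup_linear schedule. t_total is the total number of update
# steps planned for the run, and the hyper-parameter values are placeholders, not
# settings prescribed by this file. Note that step() above uses the
# add_/addcmul_/addcdiv_ call signatures of the PyTorch releases contemporary
# with this code.
import torch.nn as nn

toy_model = nn.Linear(768, 2)
optimizer = OpenAIAdam(toy_model.parameters(),
                       lr=6.25e-5,
                       schedule='warmup_linear',
                       warmup=0.002,
                       t_total=1000,
                       b1=0.9, b2=0.999, e=1e-8,
                       l2=0.01,
                       vector_l2=False,
                       max_grad_norm=1)

loss = toy_model(torch.randn(4, 768)).pow(2).mean()
loss.backward()
optimizer.step()        # scheduled Adam update, then the decoupled weight decay
optimizer.zero_grad()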
import re
import ftfy
import json
import spacy
from tqdm import tqdm
def get_pairs(word):
"""
Return set of symbol pairs in a word.
word is represented as tuple of symbols (symbols being variable-length strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
def text_standardize(text):
"""
fixes some issues the spacy tokenizer had on books corpus
also does some whitespace standardization
"""
text = text.replace('—', '-')
text = text.replace('–', '-')
text = text.replace('―', '-')
text = text.replace('…', '...')
text = text.replace('´', "'")
text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
text = re.sub(r'\s*\n\s*', ' \n ', text)
text = re.sub(r'[^\S\n]+', ' ', text)
return text.strip()
class TextEncoder(object):
"""
mostly a wrapper for a public python bpe tokenizer
"""
def __init__(self, encoder_path, bpe_path):
self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
self.encoder = json.load(open(encoder_path))
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(bpe_path, encoding='utf-8').read().split('\n')[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
def bpe(self, token):
word = tuple(token[:-1]) + ( token[-1] + '</w>',)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token+'</w>'
while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
if word == '\n </w>':
word = '\n</w>'
self.cache[token] = word
return word
def encode(self, texts, verbose=True):
texts_tokens = []
if verbose:
for text in tqdm(texts, ncols=80, leave=False):
text = self.nlp(text_standardize(ftfy.fix_text(text)))
text_tokens = []
for token in text:
text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
texts_tokens.append(text_tokens)
else:
for text in texts:
text = self.nlp(text_standardize(ftfy.fix_text(text)))
text_tokens = []
for token in text:
text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
texts_tokens.append(text_tokens)
return texts_tokens
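# Illustrative usage sketch (not part of the commit): encode raw text with the
# BPE vocabulary released for OpenAI GPT. The two paths are placeholders for the
# released files (a json mapping BPE tokens to indices and the merges file), and
# spacy's English model must be installed for spacy.load('en') to succeed.
text_encoder = TextEncoder('model/encoder_bpe_40000.json', 'model/vocab_40000.bpe')
token_ids = text_encoder.encode(["OpenAI GPT uses byte-pair encoding.",
                                 "Each text becomes a list of BPE indices."],
                                verbose=False)
# token_ids holds one list of integer indices per input text;
# pieces missing from the encoder map to index 0.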