Commit 715fa638 authored by Julien Chaumond

Merge branch 'master' into from_scratch_training

parents 764f836d 100e3b6f
@@ -384,16 +384,13 @@ def get_from_cache(
         else:
             http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)

-        # we are copying the file before closing it, so flush to avoid truncation
-        temp_file.flush()
-
         logger.info("storing %s in cache at %s", url, cache_path)
         os.rename(temp_file.name, cache_path)

         logger.info("creating metadata file for %s", cache_path)
         meta = {"url": url, "etag": etag}
         meta_path = cache_path + ".json"
         with open(meta_path, "w") as meta_file:
             json.dump(meta, meta_file)

     return cache_path
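The hunk above drops the explicit `temp_file.flush()` but keeps the overall cache-write pattern: download into a temporary file, `os.rename` it into the cache once the transfer is done, then write a `<cache_path>.json` sidecar recording the url and etag. Below is a minimal stand-alone sketch of that pattern, not the library's actual helper; `store_in_cache` and the `fetch` callback are hypothetical names standing in for `http_get`/`s3_get`.

import json
import os
import tempfile


def store_in_cache(url: str, etag: str, cache_path: str, fetch) -> str:
    """Download `url` via `fetch(file_obj)` and move the result into the cache.

    `fetch` is a hypothetical callback: it must write the payload into the
    open file object it receives.
    """
    # Download into a temporary file first so an interrupted transfer never
    # leaves a truncated file at `cache_path`.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        fetch(temp_file)

    # The temp file is closed (and therefore flushed) when the `with` block
    # exits, so a plain rename publishes it atomically on the same filesystem.
    os.rename(temp_file.name, cache_path)

    # Sidecar metadata mirrors what get_from_cache stores next to the blob.
    with open(cache_path + ".json", "w") as meta_file:
        json.dump({"url": url, "etag": etag}, meta_file)

    return cache_path


if __name__ == "__main__":
    path = store_in_cache(
        "https://example.com/model.bin",
        "etag-123",
        os.path.join(tempfile.gettempdir(), "model.bin"),
        lambda f: f.write(b"fake payload"),
    )
    print(path)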
@@ -579,6 +579,9 @@ class AlbertMLMHead(nn.Module):
         self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
         self.activation = ACT2FN[config.hidden_act]

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
         hidden_states = self.activation(hidden_states)
@@ -481,6 +481,9 @@ class BertLMPredictionHead(nn.Module):

         self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states) + self.bias
@@ -306,6 +306,9 @@ class RobertaLMHead(nn.Module):
         self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, features, **kwargs):
         x = self.dense(features)
         x = gelu(x)
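The three hunks above make the same change to the ALBERT, BERT and RoBERTa LM heads: the standalone `bias` Parameter is also attached to the decoder layer, so whatever logic resizes the decoder's bias when `resize_token_embeddings` changes the vocabulary operates on the very same tensor the head uses in its forward pass. A small sketch of why the aliasing works; `TinyLMHead` is an illustrative toy, not a class from the library.

import torch
from torch import nn


class TinyLMHead(nn.Module):
    """Toy stand-in for the ALBERT/BERT/RoBERTa LM heads patched above."""

    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # The fix from the hunks above: make `decoder.bias` and `self.bias`
        # the *same* Parameter object, so code that resizes the decoder's
        # bias during `resize_token_embeddings` resizes this one too.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        # The decoder now carries the shared bias itself.
        return self.decoder(hidden_states)


head = TinyLMHead(hidden_size=8, vocab_size=10)
print(head.decoder.bias is head.bias)  # True: a single shared Parameter

# Mimic the kind of in-place grow the resize logic performs on the decoder's
# bias when 5 tokens are added to the vocabulary.
head.decoder.bias.data = nn.functional.pad(head.decoder.bias.data, (0, 5))

# Because both attributes point at the same Parameter, `self.bias` grew too.
print(head.bias.shape)  # torch.Size([15])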
@@ -286,6 +286,7 @@ class T5Attention(nn.Module):
             bidirectional=not self.is_decoder,
             num_buckets=self.relative_attention_num_buckets,
         )
+        rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device)
         values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
         values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
         return values
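The added line moves the relative-position bucket indices onto the device of the `relative_attention_bias` embedding before the lookup. The buckets are computed from `torch.arange` ranges without a device argument, so they default to CPU, and without the `.to(...)` the embedding lookup can fail with a device mismatch when the model runs on GPU. A generic sketch of the pattern, with hypothetical sizes and a random stand-in for `_relative_position_bucket`:

import torch
from torch import nn

# Illustrative only: a relative-position bias table like T5's
# `relative_attention_bias`, with made-up sizes.
num_buckets, num_heads = 32, 8
relative_attention_bias = nn.Embedding(num_buckets, num_heads)
if torch.cuda.is_available():
    relative_attention_bias = relative_attention_bias.cuda()

# Bucket indices built on the CPU (stand-in for _relative_position_bucket).
qlen = klen = 5
rp_bucket = torch.randint(0, num_buckets, (qlen, klen))

# The fix: move the indices to wherever the embedding weight lives before
# the lookup, so CPU-built indices also work with a GPU-resident model.
rp_bucket = rp_bucket.to(relative_attention_bias.weight.device)
values = relative_attention_bias(rp_bucket)      # (qlen, klen, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0)  # (1, num_heads, qlen, klen)
print(values.shape)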
@@ -705,55 +705,71 @@ class QuestionAnsweringPipeline(Pipeline):
         # Convert inputs to features
         examples = self._args_parser(*texts, **kwargs)
-        features = squad_convert_examples_to_features(
-            examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False
-        )
-        fw_args = self.inputs_for_model([f.__dict__ for f in features])
-
-        # Manage tensor allocation on correct device
-        with self.device_placement():
-            if self.framework == "tf":
-                fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
-                start, end = self.model(fw_args)
-                start, end = start.numpy(), end.numpy()
-            else:
-                with torch.no_grad():
-                    # Retrieve the score for the context tokens only (removing question tokens)
-                    fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
-                    start, end = self.model(**fw_args)
-                    start, end = start.cpu().numpy(), end.cpu().numpy()
-
-        answers = []
-        for (example, feature, start_, end_) in zip(examples, features, start, end):
-            # Normalize logits and spans to retrieve the answer
-            start_ = np.exp(start_) / np.sum(np.exp(start_))
-            end_ = np.exp(end_) / np.sum(np.exp(end_))
-
-            # Mask padding and question
-            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
-
-            # TODO : What happens if not possible
-            # Mask CLS
-            start_[0] = end_[0] = 0
-
-            starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
-            char_to_word = np.array(example.char_to_word_offset)
-
-            # Convert the answer (tokens) back to the original text
-            answers += [
-                {
-                    "score": score.item(),
-                    "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
-                    "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
-                    "answer": " ".join(
-                        example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
-                    ),
-                }
-                for s, e, score in zip(starts, ends, scores)
-            ]
-
-        if len(answers) == 1:
-            return answers[0]
-        return answers
+        features_list = [
+            squad_convert_examples_to_features(
+                [example],
+                self.tokenizer,
+                kwargs["max_seq_len"],
+                kwargs["doc_stride"],
+                kwargs["max_question_len"],
+                False,
+            )
+            for example in examples
+        ]
+        all_answers = []
+        for features, example in zip(features_list, examples):
+            fw_args = self.inputs_for_model([f.__dict__ for f in features])
+
+            # Manage tensor allocation on correct device
+            with self.device_placement():
+                if self.framework == "tf":
+                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
+                    start, end = self.model(fw_args)
+                    start, end = start.numpy(), end.numpy()
+                else:
+                    with torch.no_grad():
+                        # Retrieve the score for the context tokens only (removing question tokens)
+                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
+                        start, end = self.model(**fw_args)
+                        start, end = start.cpu().numpy(), end.cpu().numpy()
+
+            answers = []
+            for (feature, start_, end_) in zip(features, start, end):
+                # Normalize logits and spans to retrieve the answer
+                start_ = np.exp(start_) / np.sum(np.exp(start_))
+                end_ = np.exp(end_) / np.sum(np.exp(end_))
+
+                # Mask padding and question
+                start_, end_ = (
+                    start_ * np.abs(np.array(feature.p_mask) - 1),
+                    end_ * np.abs(np.array(feature.p_mask) - 1),
+                )
+
+                # TODO : What happens if not possible
+                # Mask CLS
+                start_[0] = end_[0] = 0
+
+                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
+                char_to_word = np.array(example.char_to_word_offset)
+
+                # Convert the answer (tokens) back to the original text
+                answers += [
+                    {
+                        "score": score.item(),
+                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
+                        "answer": " ".join(
+                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
+                        ),
+                    }
+                    for s, e, score in zip(starts, ends, scores)
+                ]
+            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
+            all_answers += answers
+
+        if len(all_answers) == 1:
+            return all_answers[0]
+        return all_answers

     def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         """
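The pipeline hunk converts each example to SQuAD features separately and, per example, sorts the candidate answers by score and keeps only the best `topk` before extending `all_answers`. The span scoring itself is delegated to `decode`, whose signature is shown above; the sketch below only illustrates the idea behind that kind of top-k span selection, using a hypothetical `topk_spans` helper rather than the pipeline's own code.

import numpy as np


def topk_spans(start_probs: np.ndarray, end_probs: np.ndarray, topk: int, max_answer_len: int):
    """Score every (start, end) pair, keep well-formed spans no longer than
    `max_answer_len`, and return the `topk` best ones."""
    # candidates[i, j] = P(start=i) * P(end=j)
    candidates = np.outer(start_probs, end_probs)
    # Zero out spans that end before they start or exceed the length budget.
    candidates = np.tril(np.triu(candidates), max_answer_len - 1)
    # Take the top-k scores over the flattened matrix.
    flat = candidates.flatten()
    idx = np.argsort(-flat)[:topk]
    starts, ends = np.unravel_index(idx, candidates.shape)
    return starts, ends, flat[idx]


# Toy start/end distributions over 6 tokens.
start = np.array([0.05, 0.7, 0.1, 0.05, 0.05, 0.05])
end = np.array([0.05, 0.1, 0.6, 0.1, 0.1, 0.05])
print(topk_spans(start, end, topk=2, max_answer_len=3))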
@@ -485,6 +485,8 @@ class ModelTesterMixin:
             self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
             # Check that it actually resizes the embeddings matrix
             self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**inputs_dict)

             # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
             model_embed = model.resize_token_embeddings(model_vocab_size - 15)
@@ -492,6 +494,11 @@ class ModelTesterMixin:
             # Check that it actually resizes the embeddings matrix
             self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            model(**inputs_dict)
+
             # Check that adding and removing tokens has not modified the first part of the embedding matrix.
             models_equal = True
             for p1, p2 in zip(cloned_embeddings, model_embed.weight):
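The test changes exercise the resize end to end: after growing or shrinking the embeddings the model must still complete a forward pass, and when the vocabulary shrinks the input ids are clamped so none of them index past the new embedding matrix. This is exactly what the bias link added to the LM heads above protects. A stand-alone sketch of the same check with a small, randomly initialized BERT; the config sizes are arbitrary.

import torch
from transformers import BertConfig, BertForMaskedLM

# Tiny, randomly initialized model so the sketch runs quickly.
config = BertConfig(
    vocab_size=100, hidden_size=32, num_hidden_layers=2,
    num_attention_heads=2, intermediate_size=64,
)
model = BertForMaskedLM(config)
input_ids = torch.randint(0, config.vocab_size, (1, 8))

# Shrink the vocabulary by 15 tokens, as in the test above.
new_vocab_size = config.vocab_size - 15
model.resize_token_embeddings(new_vocab_size)

# Any id >= the new vocab size would index past the resized embedding
# matrix, so clamp before the forward pass, exactly like the test does.
input_ids.clamp_(max=new_vocab_size - 1)
outputs = model(input_ids)
print(outputs[0].shape)  # (1, 8, 85) if the LM head was resized along with the embeddings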