Commit 715fa638 authored by Julien Chaumond

Merge branch 'master' into from_scratch_training

parents 764f836d 100e3b6f
@@ -384,16 +384,13 @@ def get_from_cache(
         else:
             http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)

-        # we are copying the file before closing it, so flush to avoid truncation
-        temp_file.flush()
-
         logger.info("storing %s in cache at %s", url, cache_path)
         os.rename(temp_file.name, cache_path)

         logger.info("creating metadata file for %s", cache_path)
         meta = {"url": url, "etag": etag}
         meta_path = cache_path + ".json"
         with open(meta_path, "w") as meta_file:
             json.dump(meta, meta_file)

     return cache_path
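The hunk above drops the explicit `temp_file.flush()` but keeps the overall cache-write pattern: download into a temporary file, `os.rename` it into the cache once the transfer is done, then write a `<cache_path>.json` sidecar recording the url and etag. Below is a minimal stand-alone sketch of that pattern, not the library's actual helper; `store_in_cache` and the `fetch` callback are hypothetical names standing in for `http_get`/`s3_get`.

import json
import os
import tempfile


def store_in_cache(url: str, etag: str, cache_path: str, fetch) -> str:
    """Download `url` via `fetch(file_obj)` and move the result into the cache.

    `fetch` is a hypothetical callback: it must write the payload into the
    open file object it receives.
    """
    # Download into a temporary file first so an interrupted transfer never
    # leaves a truncated file at `cache_path`.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        fetch(temp_file)

    # The temp file is closed (and therefore flushed) when the `with` block
    # exits, so a plain rename publishes it atomically on the same filesystem.
    os.rename(temp_file.name, cache_path)

    # Sidecar metadata mirrors what get_from_cache stores next to the blob.
    with open(cache_path + ".json", "w") as meta_file:
        json.dump({"url": url, "etag": etag}, meta_file)

    return cache_path


if __name__ == "__main__":
    path = store_in_cache(
        "https://example.com/model.bin",
        "etag-123",
        os.path.join(tempfile.gettempdir(), "model.bin"),
        lambda f: f.write(b"fake payload"),
    )
    print(path)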
@@ -579,6 +579,9 @@ class AlbertMLMHead(nn.Module):
         self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
         self.activation = ACT2FN[config.hidden_act]

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
         hidden_states = self.activation(hidden_states)
@@ -481,6 +481,9 @@ class BertLMPredictionHead(nn.Module):

         self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states) + self.bias
@@ -306,6 +306,9 @@ class RobertaLMHead(nn.Module):
         self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))

+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, features, **kwargs):
         x = self.dense(features)
         x = gelu(x)
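The three hunks above make the same change to the ALBERT, BERT and RoBERTa LM heads: the standalone `bias` Parameter is also attached to the decoder layer, so whatever logic resizes the decoder's bias when `resize_token_embeddings` changes the vocabulary operates on the very same tensor the head uses in its forward pass. A small sketch of why the aliasing works; `TinyLMHead` is an illustrative toy, not a class from the library.

import torch
from torch import nn


class TinyLMHead(nn.Module):
    """Toy stand-in for the ALBERT/BERT/RoBERTa LM heads patched above."""

    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # The fix from the hunks above: make `decoder.bias` and `self.bias`
        # the *same* Parameter object, so code that resizes the decoder's
        # bias during `resize_token_embeddings` resizes this one too.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        # The decoder now carries the shared bias itself.
        return self.decoder(hidden_states)


head = TinyLMHead(hidden_size=8, vocab_size=10)
print(head.decoder.bias is head.bias)  # True: a single shared Parameter

# Mimic the kind of in-place grow the resize logic performs on the decoder's
# bias when 5 tokens are added to the vocabulary.
head.decoder.bias.data = nn.functional.pad(head.decoder.bias.data, (0, 5))

# Because both attributes point at the same Parameter, `self.bias` grew too.
print(head.bias.shape)  # torch.Size([15])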
@@ -286,6 +286,7 @@ class T5Attention(nn.Module):
             bidirectional=not self.is_decoder,
             num_buckets=self.relative_attention_num_buckets,
         )
+        rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device)
         values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
         values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
         return values
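The added line moves the relative-position bucket indices onto the device of the `relative_attention_bias` embedding before the lookup. The buckets are computed from `torch.arange` ranges without a device argument, so they default to CPU, and without the `.to(...)` the embedding lookup can fail with a device mismatch when the model runs on GPU. A generic sketch of the pattern, with hypothetical sizes and a random stand-in for `_relative_position_bucket`:

import torch
from torch import nn

# Illustrative only: a relative-position bias table like T5's
# `relative_attention_bias`, with made-up sizes.
num_buckets, num_heads = 32, 8
relative_attention_bias = nn.Embedding(num_buckets, num_heads)
if torch.cuda.is_available():
    relative_attention_bias = relative_attention_bias.cuda()

# Bucket indices built on the CPU (stand-in for _relative_position_bucket).
qlen = klen = 5
rp_bucket = torch.randint(0, num_buckets, (qlen, klen))

# The fix: move the indices to wherever the embedding weight lives before
# the lookup, so CPU-built indices also work with a GPU-resident model.
rp_bucket = rp_bucket.to(relative_attention_bias.weight.device)
values = relative_attention_bias(rp_bucket)      # (qlen, klen, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0)  # (1, num_heads, qlen, klen)
print(values.shape)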
@@ -705,55 +705,71 @@ class QuestionAnsweringPipeline(Pipeline):
         # Convert inputs to features
         examples = self._args_parser(*texts, **kwargs)
-        features = squad_convert_examples_to_features(
-            examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False
-        )
-        fw_args = self.inputs_for_model([f.__dict__ for f in features])
-
-        # Manage tensor allocation on correct device
-        with self.device_placement():
-            if self.framework == "tf":
-                fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
-                start, end = self.model(fw_args)
-                start, end = start.numpy(), end.numpy()
-            else:
-                with torch.no_grad():
-                    # Retrieve the score for the context tokens only (removing question tokens)
-                    fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
-                    start, end = self.model(**fw_args)
-                    start, end = start.cpu().numpy(), end.cpu().numpy()
-
-        answers = []
-        for (example, feature, start_, end_) in zip(examples, features, start, end):
-            # Normalize logits and spans to retrieve the answer
-            start_ = np.exp(start_) / np.sum(np.exp(start_))
-            end_ = np.exp(end_) / np.sum(np.exp(end_))
-
-            # Mask padding and question
-            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
-
-            # TODO : What happens if not possible
-            # Mask CLS
-            start_[0] = end_[0] = 0
-
-            starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
-            char_to_word = np.array(example.char_to_word_offset)
-
-            # Convert the answer (tokens) back to the original text
-            answers += [
-                {
-                    "score": score.item(),
-                    "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
-                    "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
-                    "answer": " ".join(
-                        example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
-                    ),
-                }
-                for s, e, score in zip(starts, ends, scores)
-            ]
-
-        if len(answers) == 1:
-            return answers[0]
-        return answers
+        features_list = [
+            squad_convert_examples_to_features(
+                [example],
+                self.tokenizer,
+                kwargs["max_seq_len"],
+                kwargs["doc_stride"],
+                kwargs["max_question_len"],
+                False,
+            )
+            for example in examples
+        ]
+        all_answers = []
+        for features, example in zip(features_list, examples):
+            fw_args = self.inputs_for_model([f.__dict__ for f in features])
+
+            # Manage tensor allocation on correct device
+            with self.device_placement():
+                if self.framework == "tf":
+                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
+                    start, end = self.model(fw_args)
+                    start, end = start.numpy(), end.numpy()
+                else:
+                    with torch.no_grad():
+                        # Retrieve the score for the context tokens only (removing question tokens)
+                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
+                        start, end = self.model(**fw_args)
+                        start, end = start.cpu().numpy(), end.cpu().numpy()
+
+            answers = []
+            for (feature, start_, end_) in zip(features, start, end):
+                # Normalize logits and spans to retrieve the answer
+                start_ = np.exp(start_) / np.sum(np.exp(start_))
+                end_ = np.exp(end_) / np.sum(np.exp(end_))
+
+                # Mask padding and question
+                start_, end_ = (
+                    start_ * np.abs(np.array(feature.p_mask) - 1),
+                    end_ * np.abs(np.array(feature.p_mask) - 1),
+                )
+
+                # TODO : What happens if not possible
+                # Mask CLS
+                start_[0] = end_[0] = 0
+
+                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
+                char_to_word = np.array(example.char_to_word_offset)
+
+                # Convert the answer (tokens) back to the original text
+                answers += [
+                    {
+                        "score": score.item(),
+                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
+                        "answer": " ".join(
+                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
+                        ),
+                    }
+                    for s, e, score in zip(starts, ends, scores)
+                ]
+            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
+            all_answers += answers
+
+        if len(all_answers) == 1:
+            return all_answers[0]
+        return all_answers

     def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         """
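The pipeline hunk converts each example to SQuAD features separately and, per example, sorts the candidate answers by score and keeps only the best `topk` before extending `all_answers`. The span scoring itself is delegated to `decode`, whose signature is shown above; the sketch below only illustrates the idea behind that kind of top-k span selection, using a hypothetical `topk_spans` helper rather than the pipeline's own code.

import numpy as np


def topk_spans(start_probs: np.ndarray, end_probs: np.ndarray, topk: int, max_answer_len: int):
    """Score every (start, end) pair, keep well-formed spans no longer than
    `max_answer_len`, and return the `topk` best ones."""
    # candidates[i, j] = P(start=i) * P(end=j)
    candidates = np.outer(start_probs, end_probs)
    # Zero out spans that end before they start or exceed the length budget.
    candidates = np.tril(np.triu(candidates), max_answer_len - 1)
    # Take the top-k scores over the flattened matrix.
    flat = candidates.flatten()
    idx = np.argsort(-flat)[:topk]
    starts, ends = np.unravel_index(idx, candidates.shape)
    return starts, ends, flat[idx]


# Toy start/end distributions over 6 tokens.
start = np.array([0.05, 0.7, 0.1, 0.05, 0.05, 0.05])
end = np.array([0.05, 0.1, 0.6, 0.1, 0.1, 0.05])
print(topk_spans(start, end, topk=2, max_answer_len=3))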
@@ -485,6 +485,8 @@ class ModelTesterMixin:
             self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
             # Check that it actually resizes the embeddings matrix
             self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**inputs_dict)

             # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
             model_embed = model.resize_token_embeddings(model_vocab_size - 15)
@@ -492,6 +494,11 @@ class ModelTesterMixin:
             # Check that it actually resizes the embeddings matrix
             self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            model(**inputs_dict)
+
             # Check that adding and removing tokens has not modified the first part of the embedding matrix.
             models_equal = True
             for p1, p2 in zip(cloned_embeddings, model_embed.weight):
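The test changes exercise the resize end to end: after growing or shrinking the embeddings the model must still complete a forward pass, and when the vocabulary shrinks the input ids are clamped so none of them index past the new embedding matrix. This is exactly what the bias link added to the LM heads above protects. A stand-alone sketch of the same check with a small, randomly initialized BERT; the config sizes are arbitrary.

import torch
from transformers import BertConfig, BertForMaskedLM

# Tiny, randomly initialized model so the sketch runs quickly.
config = BertConfig(
    vocab_size=100, hidden_size=32, num_hidden_layers=2,
    num_attention_heads=2, intermediate_size=64,
)
model = BertForMaskedLM(config)
input_ids = torch.randint(0, config.vocab_size, (1, 8))

# Shrink the vocabulary by 15 tokens, as in the test above.
new_vocab_size = config.vocab_size - 15
model.resize_token_embeddings(new_vocab_size)

# Any id >= the new vocab size would index past the resized embedding
# matrix, so clamp before the forward pass, exactly like the test does.
input_ids.clamp_(max=new_vocab_size - 1)
outputs = model(input_ids)
print(outputs[0].shape)  # (1, 8, 85) if the LM head was resized along with the embeddings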