Commit 715fa638 authored by Julien Chaumond

Merge branch 'master' into from_scratch_training

parents 764f836d 100e3b6f
@@ -384,9 +384,6 @@ def get_from_cache(
else:
http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
logger.info("storing %s in cache at %s", url, cache_path)
os.rename(temp_file.name, cache_path)
......
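The hunk above flushes the temporary download file before it is renamed into the cache, so no buffered bytes are lost while the file is still open. A minimal sketch of that download-to-temp-then-rename pattern, assuming `requests` for the download; the helper names `http_get` and `fetch_to_cache` are illustrative rather than the library's actual API, and renaming a still-open file assumes a POSIX filesystem:

```python
import os
import tempfile

import requests


def http_get(url, temp_file):
    # Stream the response body into the open temporary file.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        if chunk:
            temp_file.write(chunk)


def fetch_to_cache(url, cache_path):
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        http_get(url, temp_file)
        # The file is reused before it is closed, so flush the write buffer
        # to avoid a truncated copy ending up in the cache.
        temp_file.flush()
        os.rename(temp_file.name, cache_path)
```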
@@ -579,6 +579,9 @@ class AlbertMLMHead(nn.Module):
self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
self.activation = ACT2FN[config.hidden_act]
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation(hidden_states)
......
@@ -481,6 +481,9 @@ class BertLMPredictionHead(nn.Module):
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
......
@@ -306,6 +306,9 @@ class RobertaLMHead(nn.Module):
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
def forward(self, features, **kwargs):
x = self.dense(features)
x = gelu(x)
......
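The three hunks above (AlbertMLMHead, BertLMPredictionHead, RobertaLMHead) add the same line: the head's standalone `bias` parameter is assigned to `decoder.bias`, so both attributes reference a single `nn.Parameter`. When `resize_token_embeddings` later pads the decoder's bias to the new vocabulary size, the head's own `self.bias` is therefore resized as well. A toy sketch of the tie, not the library classes (sizes are made up):

```python
import torch
from torch import nn


class ToyLMHead(nn.Module):
    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # Tie the two attributes to the same Parameter so that resizing
        # the decoder's bias also resizes self.bias.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        # The tied bias is applied by the linear layer itself.
        return self.decoder(hidden_states)


head = ToyLMHead(hidden_size=8, vocab_size=10)
# Emulate what a vocabulary resize does to the decoder bias: pad it by two
# entries. Because both attributes share one Parameter, head.bias follows.
head.decoder.bias.data = nn.functional.pad(head.decoder.bias.data, (0, 2))
print(head.bias.shape)  # torch.Size([12])
```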
@@ -286,6 +286,7 @@ class T5Attention(nn.Module):
bidirectional=not self.is_decoder,
num_buckets=self.relative_attention_num_buckets,
)
+ rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device)
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
return values
......
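The one-line T5 change moves the relative-position bucket indices onto the device of the `relative_attention_bias` embedding before the lookup, since the buckets are computed as CPU tensors and the lookup would otherwise fail once the module is on a GPU. A standalone sketch of that device-alignment pattern, with toy shapes rather than T5's real configuration:

```python
import torch
from torch import nn

num_buckets, num_heads = 32, 8
relative_attention_bias = nn.Embedding(num_buckets, num_heads)
if torch.cuda.is_available():
    relative_attention_bias = relative_attention_bias.cuda()

# Bucket indices are typically built on the CPU from position arithmetic.
rp_bucket = torch.randint(0, num_buckets, (5, 5))

# Without this .to(...), the embedding lookup raises a device-mismatch error
# whenever the module lives on the GPU.
rp_bucket = rp_bucket.to(relative_attention_bias.weight.device)
values = relative_attention_bias(rp_bucket)      # shape (qlen, klen, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
```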
@@ -705,9 +705,19 @@ class QuestionAnsweringPipeline(Pipeline):
# Convert inputs to features
examples = self._args_parser(*texts, **kwargs)
- features = squad_convert_examples_to_features(
- examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False
+ features_list = [
+ squad_convert_examples_to_features(
+ [example],
+ self.tokenizer,
+ kwargs["max_seq_len"],
+ kwargs["doc_stride"],
+ kwargs["max_question_len"],
+ False,
+ )
+ for example in examples
+ ]
+ all_answers = []
+ for features, example in zip(features_list, examples):
fw_args = self.inputs_for_model([f.__dict__ for f in features])
# Manage tensor allocation on correct device
@@ -724,13 +734,16 @@ class QuestionAnsweringPipeline(Pipeline):
start, end = start.cpu().numpy(), end.cpu().numpy()
answers = []
- for (example, feature, start_, end_) in zip(examples, features, start, end):
+ for (feature, start_, end_) in zip(features, start, end):
# Normalize logits and spans to retrieve the answer
start_ = np.exp(start_) / np.sum(np.exp(start_))
end_ = np.exp(end_) / np.sum(np.exp(end_))
# Mask padding and question
- start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
+ start_, end_ = (
+ start_ * np.abs(np.array(feature.p_mask) - 1),
+ end_ * np.abs(np.array(feature.p_mask) - 1),
+ )
# TODO : What happens if not possible
# Mask CLS
@@ -751,9 +764,12 @@ class QuestionAnsweringPipeline(Pipeline):
}
for s, e, score in zip(starts, ends, scores)
]
- if len(answers) == 1:
- return answers[0]
- return answers
+ answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
+ all_answers += answers
+ if len(all_answers) == 1:
+ return all_answers[0]
+ return all_answers
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
"""
......
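The pipeline hunks change `QuestionAnsweringPipeline.__call__` to featurize each SQuAD example separately, collect answers per example, sort them by score, and keep only the top `topk`, unwrapping the list when a single answer remains. A usage sketch of the resulting behaviour (the default model is whatever the pipeline factory resolves to and is downloaded on first use):

```python
from transformers import pipeline

qa = pipeline("question-answering")

answers = qa(
    question="What does the pipeline keep?",
    context="For each question the pipeline keeps only the best-scoring spans, up to topk answers.",
    topk=2,
)
# With topk > 1 (or several questions) a list of dicts is returned, each with
# "score", "start", "end" and "answer"; a single answer comes back as one dict.
print(answers)
```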
@@ -485,6 +485,8 @@ class ModelTesterMixin:
self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+ # Check that the model can still do a forward pass successfully (every parameter should be resized)
+ model(**inputs_dict)
# Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
model_embed = model.resize_token_embeddings(model_vocab_size - 15)
@@ -492,6 +494,11 @@ class ModelTesterMixin:
# Check that it actually resizes the embeddings matrix
self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+ # Check that the model can still do a forward pass successfully (every parameter should be resized)
+ # Input ids should be clamped to the maximum size of the vocabulary
+ inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+ model(**inputs_dict)
# Check that adding and removing tokens has not modified the first part of the embedding matrix.
models_equal = True
for p1, p2 in zip(cloned_embeddings, model_embed.weight):
......
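The test additions run a real forward pass after each resize; after shrinking the vocabulary, the input ids are first clamped so that no id points outside the new, smaller embedding table. A self-contained illustration of why the clamp is needed, with toy sizes:

```python
import torch
from torch import nn

old_vocab, new_vocab = 100, 85            # e.g. resize_token_embeddings(old_vocab - 15)
embeddings = nn.Embedding(new_vocab, 16)  # the resized, smaller embedding matrix

input_ids = torch.randint(0, old_vocab, (2, 7))  # may contain ids >= new_vocab
input_ids.clamp_(max=new_vocab - 1)              # keep every id inside the new range
hidden = embeddings(input_ids)                   # the forward pass now succeeds
```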