Commit 7d0f3369 authored by Zhaoheng Ni's avatar Zhaoheng Ni Committed by Facebook GitHub Bot
Browse files

[BugFix] Fix extract_features method for WavLM models (#3350)

Summary:
Resolves https://github.com/pytorch/audio/issues/3347

`position_bias` is ignored in the `extract_features` method. This doesn't affect Wav2Vec2 or HuBERT models, but it changes the output of the transformer layers (all except the first layer) in the WavLM model, whose attention uses a relative position bias propagated between layers. This PR fixes the issue by passing `position_bias` through each layer in the method.

Pull Request resolved: https://github.com/pytorch/audio/pull/3350

Reviewed By: mthrok

Differential Revision: D46112148

Pulled By: nateanl

fbshipit-source-id: 3d21aa4b32b22da437b440097fd9b00238152596
parent fce54fd1
...@@ -144,6 +144,14 @@ class TestHFIntegration(TorchaudioTestCase): ...@@ -144,6 +144,14 @@ class TestHFIntegration(TorchaudioTestCase):
hyp = imported.encoder.transformer(x) hyp = imported.encoder.transformer(x)
self.assertEqual(ref, hyp) self.assertEqual(ref, hyp)
# Test get_intermediate_outputs method
b, l, e = 16, 3, config["hidden_size"]
x = torch.randn(b, l, e)
ref = original.encoder(x, output_hidden_states=True).hidden_states
hyp = imported.encoder.transformer.get_intermediate_outputs(x)
for i in range(len(hyp)):
self.assertEqual(ref[i + 1], hyp[i], atol=1e-4, rtol=0.001)
def _test_import_finetune(self, original, imported, config): def _test_import_finetune(self, original, imported, config):
# Aux # Aux
x = torch.randn(3, 10, config["hidden_size"]) x = torch.randn(3, 10, config["hidden_size"])
...@@ -243,6 +251,14 @@ class TestHFIntegration(TorchaudioTestCase): ...@@ -243,6 +251,14 @@ class TestHFIntegration(TorchaudioTestCase):
hyp = imported.encoder.transformer(x) hyp = imported.encoder.transformer(x)
self.assertEqual(ref, hyp) self.assertEqual(ref, hyp)
# Test get_intermediate_outputs method
b, l, e = 16, 3, config["hidden_size"]
x = torch.randn(b, l, e)
ref = original.encoder(x, output_hidden_states=True).hidden_states
hyp = imported.encoder.transformer.get_intermediate_outputs(x)
for i in range(len(hyp)):
self.assertEqual(ref[i + 1], hyp[i], atol=1e-4, rtol=0.001)
def _test_recreate(self, imported, reloaded, config): def _test_recreate(self, imported, reloaded, config):
# FeatureExtractor # FeatureExtractor
x = torch.randn(3, 1024) x = torch.randn(3, 1024)
......
...@@ -458,9 +458,10 @@ class Transformer(Module): ...@@ -458,9 +458,10 @@ class Transformer(Module):
raise ValueError(f"`num_layers` must be between [1, {len(self.layers)}]") raise ValueError(f"`num_layers` must be between [1, {len(self.layers)}]")
ret: List[Tensor] = [] ret: List[Tensor] = []
position_bias = None
x = self._preprocess(x) x = self._preprocess(x)
for layer in self.layers: for layer in self.layers:
x, _ = layer(x, attention_mask) # Ignore position_bias x, position_bias = layer(x, attention_mask, position_bias=position_bias)
ret.append(x) ret.append(x)
if num_layers is not None and len(ret) >= num_layers: if num_layers is not None and len(ret) >= num_layers:
return ret return ret
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment