Fix HuBERT xlarge configuration and test (#1811)

1. Fix the HuBERT xlarge model config 2. In the 48 transformer layers of HuBERT xlarge model, very few elements deviate from the equivalent model of fairseq, and exceeds the default atol 1e-5. This commit relax it to 3e-5 for the specific test.

Fix HuBERT xlarge configuration and test (#1811)
1. Fix the HuBERT xlarge model config 2. In the 48 transformer layers of HuBERT xlarge model, very few elements deviate from the equivalent model of fairseq, and exceeds the default atol 1e-5. This commit relax it to 3e-5 for the specific test.
13b2349a · moto · GitHub · d64648b6 · 13b2349a · 13b2349a
Unverified Commit 13b2349a authored Oct 01, 2021 by moto Committed by GitHub Oct 01, 2021
Showing with 17 additions and 17 deletions

test/torchaudio_unittest/models/wav2vec2/fairseq_integration_test.py ...udio_unittest/models/wav2vec2/fairseq_integration_test.py +14 -14

torchaudio/models/wav2vec2/model.py torchaudio/models/wav2vec2/model.py +3 -3

No files found.
--- a/test/torchaudio_unittest/models/wav2vec2/fairseq_integration_test.py
+++ b/test/torchaudio_unittest/models/wav2vec2/fairseq_integration_test.py
 import json
-import sys

 import torch
 from torchaudio.models.wav2vec2 import (
@@ -43,7 +42,7 @@ WAV2VEC2_LARGE_LV60K = _load_config('wav2vec_vox_new')
 WAV2VEC2_XLSR_53_56K = _load_config('xlsr_53_56k')
 HUBERT_BASE = _load_config('hubert_base_ls960')
 HUBERT_LARGE_LL60K = _load_config('hubert_large_ll60k')
-HUBERT_XLARGE_LL60K = _load_config('hubert_large_ll60k')
+HUBERT_XLARGE_LL60K = _load_config('hubert_xtralarge_ll60k')
 # Finetuning models
 WAV2VEC2_BASE_960H = _load_config('wav2vec_small_960h')
 WAV2VEC2_LARGE_960H = _load_config('wav2vec_large_960h')
@@ -137,14 +136,8 @@ class TestFairseqIntegration(TorchaudioTestCase):
    def test_import_wave2vec2_pretraining_model(self, config, _):
        """Wav2vec2 pretraining models from fairseq can be imported and yields the same results"""
        batch_size, num_frames = 3, 1024
-        atol = 1.1e-05 if sys.platform == "darwin" else 1e-05
-        # macOS CI jobs fails dues to very small descrepency
-        # AssertionError: False is not true : Tensors failed to compare as equal!
-        # With rtol=1.3e-06 and atol=1e-05, found 1 element(s) (out of 6144)
-        # whose difference(s) exceeded the margin of error (including 0 nan comparisons).
-        # The greatest difference was 1.0967254638671875e-05 (-0.12493154406547546 vs.
-        # -0.12494251132011414), which occurred at index (1, 1, 169).

+        torch.manual_seed(0)
        original = self._get_model(config).eval()
        imported = import_fairseq_model(original).eval()

@@ -152,22 +145,29 @@ class TestFairseqIntegration(TorchaudioTestCase):
        hyp, _ = imported.extract_features(x)
        refs = original.extract_features(x, padding_mask=torch.zeros_like(x), layer=-1)
        for i, (ref, _) in enumerate(refs['layer_results']):
-            self.assertEqual(hyp[i], ref.transpose(0, 1), atol=atol, rtol=1.3e-06)
+            self.assertEqual(hyp[i], ref.transpose(0, 1))

    @HUBERT_PRETRAINING_CONFIGS
-    def test_import_hubert_pretraining_model(self, config, _):
+    def test_import_hubert_pretraining_model(self, config, factory_func):
        """HuBERT pretraining models from fairseq can be imported and yields the same results"""
        batch_size, num_frames = 3, 1024

+        torch.manual_seed(0)
        original = self._get_model(config).eval()
        imported = import_fairseq_model(original).eval()

        x = torch.randn(batch_size, num_frames)
        mask = torch.zeros_like(x)
        hyp, _ = imported.extract_features(x)
-        for i in range(len(original.encoder.layers)):
-            ref, _ = original.extract_features(x, padding_mask=mask, output_layer=i + 1)
-            self.assertEqual(hyp[i], ref)
+
+        # check the last layer
+        ref, _ = original.extract_features(x, padding_mask=mask, output_layer=len(original.encoder.layers))
+        atol = 3.0e-05 if factory_func is hubert_xlarge else 1.0e-5
+        self.assertEqual(hyp[-1], ref, atol=atol, rtol=1.3e-6)
+
+        # check the first layer
+        ref, _ = original.extract_features(x, padding_mask=mask, output_layer=1)
+        self.assertEqual(hyp[0], ref)

    @ALL_PRETRAINING_CONFIGS
    def test_recreate_pretraining_model(self, config, factory_func):

--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -435,14 +435,14 @@ def hubert_xlarge() -> Wav2Vec2Model:
        extractor_mode='layer_norm',
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
-        encoder_embed_dim=1024,
+        encoder_embed_dim=1280,
        encoder_projection_dropout=0.0,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
-        encoder_num_layers=24,
+        encoder_num_layers=48,
        encoder_num_heads=16,
        encoder_attention_dropout=0.0,
-        encoder_ff_interm_features=4096,
+        encoder_ff_interm_features=5120,
        encoder_ff_interm_dropout=0.0,
        encoder_dropout=0.0,
        encoder_layer_norm_first=True,