Commit 06301c0a authored by Jeff Hwang, committed by Facebook GitHub Bot

Add Frechet distance function (#3545)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/3545

Adds function for computing the Fréchet distance between two multivariate normal distributions.

Reviewed By: mthrok

Differential Revision: D48126102

fbshipit-source-id: e4e122b831e1e752037c03f5baa9451e81ef1697
parent 8d858c38
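
For orientation before the diff, a minimal usage sketch of the function this commit adds. The embedding framing, tensor shapes, and sample counts below are illustrative assumptions, not part of the change itself:

import torch
import torchaudio.functional as F

# Fit Gaussians to two hypothetical sets of embedding vectors, then compare them.
x = torch.randn(1000, 16)  # e.g. reference embeddings, shape (num_samples, N)
y = torch.randn(1000, 16)  # e.g. generated embeddings

mu_x, mu_y = x.mean(dim=0), y.mean(dim=0)  # means, shape (N,)
sigma_x = torch.cov(x.T)                   # covariance matrices, shape (N, N)
sigma_y = torch.cov(y.T)

dist = F.frechet_distance(mu_x, sigma_x, mu_y, sigma_y)
print(dist.item())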
@@ -32,6 +32,7 @@ Utility
   preemphasis
   deemphasis
   speed
   frechet_distance
Forced Alignment
----------------
...
@@ -66,7 +66,7 @@
year = {2017}
}
@misc{conneau2020unsupervised,
title={Unsupervised Cross-lingual Representation Learning for Speech Recognition},
author={Alexis Conneau and Alexei Baevski and Ronan Collobert and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.13979},
@@ -80,7 +80,7 @@
year={2014}
}
@misc{ardila2020common,
title={Common Voice: A Massively-Multilingual Speech Corpus},
author={Rosana Ardila and Megan Branson and Kelly Davis and Michael Henretty and Michael Kohler and Josh Meyer and Reuben Morais and Lindsay Saunders and Francis M. Tyers and Gregor Weber},
year={2020},
eprint={1912.06670},
@@ -99,16 +99,16 @@
}
@INPROCEEDINGS{librilight,
author={J. {Kahn} and M. {Rivière} and W. {Zheng} and E. {Kharitonov} and Q. {Xu} and P. E. {Mazaré} and J. {Karadayi} and V. {Liptchinsky} and R. {Collobert} and C. {Fuegen} and T. {Likhomanenko} and G. {Synnaeve} and A. {Joulin} and A. {Mohamed} and E. {Dupoux}},
booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Libri-Light: A Benchmark for ASR with Limited or No Supervision},
year={2020},
pages={7669-7673},
note = {\url{https://github.com/facebookresearch/libri-light}},
}
@INPROCEEDINGS{7178964,
author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Librispeech: An ASR corpus based on public domain audio books},
year={2015},
volume={},
number={},
@@ -122,7 +122,7 @@
year = {2019},
}
@misc{baevski2020wav2vec,
title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.11477},
@@ -130,7 +130,7 @@
primaryClass={cs.CL}
}
@misc{hsu2021hubert,
title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
year={2021},
eprint={2106.07447},
@@ -138,7 +138,7 @@
primaryClass={cs.CL}
}
@misc{hannun2014deep,
title={Deep Speech: Scaling up end-to-end speech recognition},
author={Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng},
year={2014},
eprint={1412.5567},
@@ -146,7 +146,7 @@
primaryClass={cs.CL}
}
@misc{graves2012sequence,
title={Sequence Transduction with Recurrent Neural Networks},
author={Alex Graves},
year={2012},
eprint={1211.3711},
@@ -154,7 +154,7 @@
primaryClass={cs.NE}
}
@misc{collobert2016wav2letter,
title={Wav2Letter: an End-to-End ConvNet-based Speech Recognition System},
author={Ronan Collobert and Christian Puhrsch and Gabriel Synnaeve},
year={2016},
eprint={1609.03193},
@@ -162,7 +162,7 @@
primaryClass={cs.LG}
}
@misc{kalchbrenner2018efficient,
title={Efficient Neural Audio Synthesis},
author={Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and Aaron van den Oord and Sander Dieleman and Koray Kavukcuoglu},
year={2018},
eprint={1802.08435},
@@ -202,8 +202,8 @@
}
@INPROCEEDINGS{6701851,
author={Perraudin, Nathanaël and Balazs, Peter and Søndergaard, Peter L.},
booktitle={2013 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
title={A fast Griffin-Lim algorithm},
year={2013},
volume={},
number={},
@@ -211,8 +211,8 @@
doi={10.1109/WASPAA.2013.6701851}}
@INPROCEEDINGS{1172092,
author={Griffin, D. and Jae Lim},
booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing},
title={Signal estimation from modified short-time Fourier transform},
year={1983},
volume={8},
number={},
@@ -220,8 +220,8 @@
doi={10.1109/ICASSP.1983.1172092}}
@INPROCEEDINGS{6854049,
author={Ghahremani, Pegah and BabaAli, Bagher and Povey, Daniel and Riedhammer, Korbinian and Trmal, Jan and Khudanpur, Sanjeev},
booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={A pitch extraction algorithm tuned for automatic speech recognition},
year={2014},
volume={},
number={},
@@ -254,16 +254,16 @@
organization={IEEE}
}
@inproceedings{shi2021emformer,
title={Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition},
author={Shi, Yangyang and Wang, Yongqiang and Wu, Chunyang and Yeh, Ching-Feng and Chan, Julian and Zhang, Frank and Le, Duc and Seltzer, Mike},
booktitle={ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={6783-6787},
year={2021}
}
@inproceedings{9747706,
author={Shi, Yangyang and Wu, Chunyang and Wang, Dilin and Xiao, Alex and Mahadeokar, Jay and Zhang, Xiaohui and Liu, Chunxi and Li, Ke and Shangguan, Yuan and Nagaraja, Varun and Kalinli, Ozlem and Seltzer, Mike},
booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Streaming Transformer Transducer based Speech Recognition Using Non-Causal Convolution},
year={2022},
volume={},
number={},
@@ -441,8 +441,8 @@ abstract = {End-to-end spoken language translation (SLT) has recently gained pop
}
@INPROCEEDINGS{9746490,
author={Srivastava, Sangeeta and Wang, Yun and Tjandra, Andros and Kumar, Anurag and Liu, Chunxi and Singh, Kritika and Saraf, Yatharth},
booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Conformer-Based Self-Supervised Learning For Non-Speech Audio Tasks},
year={2022},
volume={},
number={},
@@ -579,3 +579,14 @@ booktitle = {International Conference on Acoustics, Speech and Signal Processing
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{dowson1982frechet,
title={The Fr{\'e}chet distance between multivariate normal distributions},
author={Dowson, D. C. and Landau, B. V.},
journal={Journal of Multivariate Analysis},
volume={12},
number={3},
pages={450--455},
year={1982},
publisher={Elsevier}
}
@@ -383,6 +383,14 @@ class Autograd(TestBaseMixin):
        coeff = 0.9
        self.assert_grad(F.deemphasis, (waveform, coeff))

    def test_frechet_distance(self):
        N = 16
        mu_x = torch.rand((N,))
        sigma_x = torch.rand((N, N))
        mu_y = torch.rand((N,))
        sigma_y = torch.rand((N, N))
        self.assert_grad(F.frechet_distance, (mu_x, sigma_x, mu_y, sigma_y))


class AutogradFloat32(TestBaseMixin):
    def assert_grad(
...
@@ -1282,6 +1282,38 @@ class Functional(TestBaseMixin):
        spans = F.merge_tokens(tokens_, scores_, blank=0)
        self._assert_tokens(spans, expected_)

    def test_frechet_distance_univariate(self):
        r"""Check that the Fréchet distance is computed correctly for the simple one-dimensional case."""
        mu_x = torch.rand((1,), device=self.device)
        sigma_x = torch.rand((1, 1), device=self.device)
        mu_y = torch.rand((1,), device=self.device)
        sigma_y = torch.rand((1, 1), device=self.device)
        # Matrix square root reduces to scalar square root.
        expected = (mu_x - mu_y) ** 2 + sigma_x + sigma_y - 2 * torch.sqrt(sigma_x * sigma_y)
        expected = expected.item()
        actual = F.frechet_distance(mu_x, sigma_x, mu_y, sigma_y)
        self.assertEqual(expected, actual)

    def test_frechet_distance_diagonal_covariance(self):
        r"""Check that the Fréchet distance is computed correctly when the covariance matrices are diagonal."""
        N = 15
        mu_x = torch.rand((N,), device=self.device)
        sigma_x = torch.diag(torch.rand((N,), device=self.device))
        mu_y = torch.rand((N,), device=self.device)
        sigma_y = torch.diag(torch.rand((N,), device=self.device))
        expected = (
            torch.sum((mu_x - mu_y) ** 2) + torch.sum(sigma_x + sigma_y) - 2 * torch.sum(torch.sqrt(sigma_x * sigma_y))
        )
        expected = expected.item()
        actual = F.frechet_distance(mu_x, sigma_x, mu_y, sigma_y)
        self.assertEqual(expected, actual)


class FunctionalCPUOnly(TestBaseMixin):
    def test_melscale_fbanks_no_warning_high_n_freq(self):
...
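
A short aside on why the diagonal-covariance expectation in the test above has that form (this is the underlying identity, not part of the commit): diagonal covariance matrices commute, so

    \sqrt{\Sigma_X \Sigma_Y} = \mathrm{diag}\left(\sqrt{\sigma_{X,1}\sigma_{Y,1}}, \ldots, \sqrt{\sigma_{X,N}\sigma_{Y,N}}\right),

and the trace term \text{Tr}\left(\Sigma_X + \Sigma_Y - 2\sqrt{\Sigma_X \Sigma_Y}\right) collapses to the elementwise sums that the test computes with torch.sum.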
@@ -36,6 +36,7 @@ from .functional import (
    detect_pitch_frequency,
    edit_distance,
    fftconvolve,
    frechet_distance,
    griffinlim,
    inverse_spectrogram,
    linear_fbanks,
@@ -122,4 +123,5 @@ __all__ = [
    "speed",
    "preemphasis",
    "deemphasis",
    "frechet_distance",
]
@@ -2499,3 +2499,41 @@ def deemphasis(waveform, coeff: float = 0.97) -> torch.Tensor:
    a_coeffs = torch.tensor([1.0, -coeff], dtype=waveform.dtype, device=waveform.device)
    b_coeffs = torch.tensor([1.0, 0.0], dtype=waveform.dtype, device=waveform.device)
    return torchaudio.functional.lfilter(waveform, a_coeffs=a_coeffs, b_coeffs=b_coeffs)

def frechet_distance(mu_x, sigma_x, mu_y, sigma_y):
    r"""Computes the Fréchet distance between two multivariate normal distributions :cite:`dowson1982frechet`.

    Concretely, for multivariate Gaussians :math:`X(\mu_X, \Sigma_X)`
    and :math:`Y(\mu_Y, \Sigma_Y)`, the function computes and returns :math:`F` as

    .. math::
        F(X, Y) = || \mu_X - \mu_Y ||_2^2
        + \text{Tr}\left( \Sigma_X + \Sigma_Y - 2 \sqrt{\Sigma_X \Sigma_Y} \right)

    Args:
        mu_x (torch.Tensor): mean :math:`\mu_X` of multivariate Gaussian :math:`X`, with shape `(N,)`.
        sigma_x (torch.Tensor): covariance matrix :math:`\Sigma_X` of :math:`X`, with shape `(N, N)`.
        mu_y (torch.Tensor): mean :math:`\mu_Y` of multivariate Gaussian :math:`Y`, with shape `(N,)`.
        sigma_y (torch.Tensor): covariance matrix :math:`\Sigma_Y` of :math:`Y`, with shape `(N, N)`.

    Returns:
        torch.Tensor: the Fréchet distance between :math:`X` and :math:`Y`.
    """
    if len(mu_x.size()) != 1:
        raise ValueError(f"Input mu_x must be one-dimensional; got dimension {len(mu_x.size())}.")
    if len(sigma_x.size()) != 2:
        raise ValueError(f"Input sigma_x must be two-dimensional; got dimension {len(sigma_x.size())}.")
    if sigma_x.size(0) != sigma_x.size(1) or sigma_x.size(0) != mu_x.size(0):
        raise ValueError("Each of sigma_x's dimensions must match mu_x's size.")
    if mu_x.size() != mu_y.size():
        raise ValueError(f"Inputs mu_x and mu_y must have the same shape; got {mu_x.size()} and {mu_y.size()}.")
    if sigma_x.size() != sigma_y.size():
        raise ValueError(
            f"Inputs sigma_x and sigma_y must have the same shape; got {sigma_x.size()} and {sigma_y.size()}."
        )

    a = (mu_x - mu_y).square().sum()
    b = sigma_x.trace() + sigma_y.trace()
    # Trace of the matrix square root of sigma_x @ sigma_y, computed as the sum of the
    # square roots of its eigenvalues.
    c = torch.linalg.eigvals(sigma_x @ sigma_y).sqrt().real.sum()
    return a + b - 2 * c
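
As a sanity check on the implementation above: the trace term can also be computed from a symmetric eigendecomposition of :math:`\Sigma_X^{1/2} \Sigma_Y \Sigma_X^{1/2}`, which is similar to :math:`\Sigma_X \Sigma_Y`. The sketch below is not part of the commit; it assumes float64 inputs and an (almost surely) invertible sigma_x, and simply compares the two routes.

import torch
import torchaudio.functional as F

torch.manual_seed(0)
N = 8

def random_covariance(n):
    # a @ a.T is symmetric positive semi-definite, i.e. a valid covariance matrix.
    a = torch.randn(n, n, dtype=torch.float64)
    return a @ a.T

mu_x, mu_y = torch.randn(N, dtype=torch.float64), torch.randn(N, dtype=torch.float64)
sigma_x, sigma_y = random_covariance(N), random_covariance(N)

# Reference: Tr(sqrt(Sx Sy)) = Tr(sqrt(Sx^{1/2} Sy Sx^{1/2})), via symmetric eigendecompositions.
evals_x, evecs_x = torch.linalg.eigh(sigma_x)
sqrt_sigma_x = evecs_x @ torch.diag(evals_x.clamp(min=0).sqrt()) @ evecs_x.T
inner = sqrt_sigma_x @ sigma_y @ sqrt_sigma_x
trace_term = torch.linalg.eigvalsh(inner).clamp(min=0).sqrt().sum()

reference = (mu_x - mu_y).square().sum() + sigma_x.trace() + sigma_y.trace() - 2 * trace_term
actual = F.frechet_distance(mu_x, sigma_x, mu_y, sigma_y)
print(torch.isclose(actual, reference))  # expected: tensor(True)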