v1.0

39ac40a9 · chenzk · 39ac40a9 · 39ac40a9 · 39ac40a9 · 39ac40a9
Commit 39ac40a9 authored Jun 06, 2025 by chenzk
20 changed files
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/David_Faustino/xTOk1Jz-F_g_0000015.wav
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/David_Faustino/xTOk1Jz-F_g_0000015.wav
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Josh_Gad/HXUqYaOwrxA_0000015.wav
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Josh_Gad/HXUqYaOwrxA_0000015.wav
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Josh_Gad/RFyw7V3SOnQ_0000001.wav
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Josh_Gad/RFyw7V3SOnQ_0000001.wav
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Lea_Thompson/HladKGyKTLM_0000006.wav
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Lea_Thompson/HladKGyKTLM_0000006.wav
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Lea_Thompson/mHTAr5dlAgc_0000004.wav
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Lea_Thompson/mHTAr5dlAgc_0000004.wav
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Zulay_Henao/WbB8m9-wlIQ_0000001.wav
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Zulay_Henao/WbB8m9-wlIQ_0000001.wav
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Zulay_Henao/gFfcgOVmiO0_0000002.wav
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/vox1_data/Zulay_Henao/gFfcgOVmiO0_0000002.wav
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/wget-log
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/downstreams/speaker_verification/wget-log
+--2023-05-04 23:05:23--  https://msranlcmtteamdrive.blob.core.windows.net/share/wavlm/WavLM-Large.pt?sv=2020-08-04
+Resolving bj-rd-proxy.byted.org (bj-rd-proxy.byted.org)... fdbd:dc02:fe:201b::1, 10.8.6.28
+Connecting to bj-rd-proxy.byted.org (bj-rd-proxy.byted.org)|fdbd:dc02:fe:201b::1|:3128... connected.
+Proxy tunneling failed: Origin DNS ErrorUnable to establish SSL connection.
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/CODE_OF_CONDUCT.md
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/CODE_OF_CONDUCT.md
+# Code of Conduct
+## Our Pledge
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+## Our Standards
+Examples of behavior that contributes to creating a positive environment
+include:
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Our Responsibilities
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+## Scope
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at <conduct@pytorch.org>. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/CONTRIBUTING.md
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/CONTRIBUTING.md
+# Contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq)
+We want to make contributing to this project as easy and transparent as
+possible.
+## Pull Requests
+We actively welcome your pull requests.
+1. Fork the repo and create your branch from `master`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+Complete your CLA here: <https://code.facebook.com/cla>
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+## License
+By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq),
+you agree that your contributions will be licensed under the LICENSE file in
+the root directory of this source tree.
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/LICENSE
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/LICENSE
+MIT License
+Copyright (c) Facebook, Inc. and its affiliates.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/config.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/config.yaml
+# @package _group_
+common:
+    no_progress_bar: false
+    log_interval: 100
+    log_format: null
+    tensorboard_logdir: null
+    seed: 1
+    cpu: false
+    tpu: false
+    bf16: false
+    fp16: false
+    memory_efficient_fp16: false
+    memory_efficient_bf16: false
+    fp16_no_flatten_grads: false
+    fp16_init_scale: 128
+    fp16_scale_window: null
+    fp16_scale_tolerance: 0.0
+    min_loss_scale: 1.0e-4
+    threshold_loss_scale: null
+    user_dir: null
+    empty_cache_freq: 0
+    all_gather_list_size: 16384
+    model_parallel_size: 1
+    quantization_config_path: null
+    profile: false
+distributed_training:
+    distributed_rank: 0
+    distributed_backend: "nccl"
+    distributed_init_method: null
+    distributed_port: -1
+    device_id: 0
+    local_rank: 0
+    distributed_no_spawn: false
+    ddp_backend: "c10d"
+    bucket_cap_mb: 25
+    fix_batches_to_gpus: false
+    find_unused_parameters: false
+    fast_stat_sync: false
+    broadcast_buffers: false
+    distributed_wrapper: "DDP"
+    slowmo_momentum: null
+    slowmo_algorithm: "LocalSGD"
+    localsgd_frequency: 3
+dataset:
+    num_workers: 1
+    skip_invalid_size_inputs_valid_test: false
+    max_tokens: null
+    batch_size: null
+    required_batch_size_multiple: 8
+    dataset_impl: null
+    data_buffer_size: 10
+    train_subset: "train"
+    valid_subset: "valid"
+    validate_interval: 1
+    fixed_validation_seed: null
+    disable_validation: false
+    curriculum: 0
+    gen_subset: "test"
+    num_shards: 1
+    shard_id: 0
+    max_tokens_valid: ${dataset.max_tokens}
+    batch_size_valid: ${dataset.batch_size}
+optimization:
+    max_epoch: 0
+    max_update: 0
+    clip_norm: 25.0
+    sentence_avg: false
+    update_freq: [ 1 ]
+    lr: [ 0.25 ]
+    min_lr: -1.0
+    use_bmuf: false
+checkpoint:
+    save_dir: "checkpoints"
+    restore_file: "checkpoint_last.pt"
+    reset_dataloader: false
+    reset_lr_scheduler: false
+    reset_meters: false
+    reset_optimizer: false
+    optimizer_overrides: "{}"
+    save_interval: 1
+    save_interval_updates: 0
+    keep_interval_updates: -1
+    keep_last_epochs: -1
+    keep_best_checkpoints: -1
+    no_save: false
+    no_epoch_checkpoints: false
+    no_last_checkpoints: false
+    no_save_optimizer_state: false
+    best_checkpoint_metric: "loss"
+    maximize_best_checkpoint_metric: false
+    patience: -1
+    checkpoint_suffix: ""
+bmuf:
+    block_lr: 1
+    block_momentum: 0.875
+    global_sync_iter: 50
+    warmup_iterations: 500
+    use_nbm: false
+    average_sync: false
+defaults:
+    - task: language_modeling
+    - model: null
+    - criterion: null
+    - optimizer: null
+    - lr_scheduler: null
+    - bpe: null
+    - tokenizer: null
+    - scoring: null
+    - generation: null
+    - common_eval: null
+    - eval_lm: null
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/criterion/adaptive_loss.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/criterion/adaptive_loss.yaml
+# @package _group_
+sentence_avg: ${optimization.sentence_avg}
+ddp_backend: ${distributed_training.ddp_backend}
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/criterion/cross_entropy.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/criterion/cross_entropy.yaml
+# @package _group_
+sentence_avg: ${optimization.sentence_avg}
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/lr_scheduler/cosine.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/lr_scheduler/cosine.yaml
+# @package _group_
+warmup_updates: 0
+warmup_init_lr: -1
+max_lr: 1.0
+t_mult: 1.0
+lr_period_updates: -1
+lr_shrink: 0.1
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/lr_scheduler/inverse_sqrt.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/lr_scheduler/inverse_sqrt.yaml
+# @package _group_
+warmup_updates: 4000
+warmup_init_lr: -1
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm.yaml
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 2048
+decoder_layers: 6
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm_baevski_gbw.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm_baevski_gbw.yaml
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.1
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 512
+decoder_output_dim: 512
+decoder_input_dim: 512
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm_baevski_wiki103.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm_baevski_wiki103.yaml
+# @package _group_
+activation_fn: "relu"
+dropout: 0.3
+attention_dropout: 0.1
+activation_dropout: 0.1
+relu_dropout: 0.1
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 16
+decoder_attention_heads: 8
+decoder_normalize_before: true
+no_decoder_final_norm: true
+adaptive_softmax_cutoff: "20000,60000"
+adaptive_softmax_dropout: 0.2
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: true
+adaptive_input_factor: 4
+adaptive_input_cutoff: "20000,60000"
+tie_adaptive_weights: true
+tie_adaptive_proj: true
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0
--- a/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm_big.yaml
+++ b/third_party/seed-tts-eval/thirdparty/UniSpeech/src/config/model/transformer_lm_big.yaml
+# @package _group_
+activation_fn: "relu"
+dropout: 0.1
+attention_dropout: 0.0
+activation_dropout: 0.0
+relu_dropout: 0.0
+decoder_embed_dim: 1024
+decoder_output_dim: 1024
+decoder_input_dim: 1024
+decoder_ffn_embed_dim: 4096
+decoder_layers: 12
+decoder_attention_heads: 16
+decoder_normalize_before: true
+no_decoder_final_norm: false
+adaptive_softmax_cutoff: null
+adaptive_softmax_dropout: 0
+adaptive_softmax_factor: 4
+no_token_positional_embeddings: false
+share_decoder_input_output_embed: false
+character_embeddings: false
+character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
+character_embedding_dim: 4
+char_embedder_highway_layers: 2
+adaptive_input: false
+adaptive_input_factor: 4
+adaptive_input_cutoff: null
+tie_adaptive_weights: false
+tie_adaptive_proj: false
+decoder_learned_pos: false
+decoder_layerdrop: 0
+decoder_layers_to_keep: null
+layernorm_embedding: false
+no_scale_embedding: false
+quant_noise_pq: 0
+quant_noise_pq_block_size: 8
+quant_noise_scalar: 0