Commit fb341b17 authored by Gustaf Ahdritz

Merge

parents f30d77b7 0067da9f
@@ -61,7 +61,7 @@ To install the HH-suite to `/usr/bin`, run
 To download DeepMind's pretrained parameters and common ground truth data, run:
 ```bash
-scripts/download_data.sh data/
+bash scripts/download_data.sh data/
 ```
 You have two choices for downloading protein databases, depending on whether
@@ -70,14 +70,14 @@ you want to use DeepMind's MSA generation pipeline (w/ HMMR & HHblits) or
 MMseqs2 instead. For the former, run:
 ```bash
-scripts/download_alphafold_dbs.sh data/
+bash scripts/download_alphafold_dbs.sh data/
 ```
 For the latter, run:
 ```bash
-scripts/download_mmseqs_databases.sh data/  # downloads .tar files
-scripts/prep_mmseqs_databases.sh data/  # unpacks and preps the databases
+bash scripts/download_mmseqs_dbs.sh data/  # downloads .tar files
+bash scripts/prep_mmseqs_dbs.sh data/  # unpacks and preps the databases
 ```
 Make sure to run the latter command on the machine that will be used for MSA
...
name: openfold_venv
channels:
  - conda-forge
  - bioconda
  - pytorch
dependencies:
  - pip:
      - biopython==1.79
      - deepspeed==0.5.3
      - dm-tree==0.1.6
      - ml-collections==0.1.0
      - numpy==1.21.2
      - PyYAML==5.4.1
      - requests==2.26.0
      - scipy==1.7.1
      - tqdm==4.62.2
      - typing-extensions==3.10.0.2
      - pytorch_lightning==1.5.0
      - nvidia-pyindex
      - nvidia-dllogger
  - pytorch::pytorch=1.10.*
  - conda-forge::python=3.7
  - conda-forge::setuptools=59.5.0
  - conda-forge::pip
  - conda-forge::openmm=7.5.1
  - conda-forge::pdbfixer
  - bioconda::aria2
  - bioconda::hmmer==3.3.2
  - bioconda::hhsuite==3.3.0
  - bioconda::kalign2==2.04
@@ -370,7 +370,7 @@ class OpenFoldDataLoader(torch.utils.data.DataLoader):
         keyed_probs.append(
             ("use_clamped_fape", [1 - clamp_prob, clamp_prob])
         )
         if(stage_cfg.uniform_recycling):
             recycling_probs = [
                 1. / (max_iters + 1) for _ in range(max_iters + 1)
@@ -380,7 +380,7 @@ class OpenFoldDataLoader(torch.utils.data.DataLoader):
                 0. for _ in range(max_iters + 1)
             ]
             recycling_probs[-1] = 1.
         keyed_probs.append(
             ("no_recycling_iters", recycling_probs)
         )
...
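For orientation: the hunks above choose how many recycling iterations each training batch is assigned. With `uniform_recycling` enabled, every count from 0 to `max_iters` is equally likely; otherwise all of the probability mass sits on `max_iters`. Below is a minimal standalone sketch of that logic plus an illustrative sampling step (the helper name and the use of `random.choices` are assumptions for this example, not OpenFold's API):

```python
import random

def build_recycling_probs(max_iters: int, uniform_recycling: bool) -> list:
    # Mirrors the diff above: uniform over 0..max_iters, or all mass on max_iters.
    if uniform_recycling:
        return [1. / (max_iters + 1) for _ in range(max_iters + 1)]
    probs = [0. for _ in range(max_iters + 1)]
    probs[-1] = 1.
    return probs

print(build_recycling_probs(3, True))   # [0.25, 0.25, 0.25, 0.25]
print(build_recycling_probs(3, False))  # [0.0, 0.0, 0.0, 1.0]

# One way to draw a per-batch value from such a keyed distribution:
no_recycling_iters = random.choices(
    population=range(3 + 1),
    weights=build_recycling_probs(3, True),
)[0]
print(no_recycling_iters)
```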
@@ -1574,4 +1574,10 @@ class AlphaFoldLoss(nn.Module):
         crop_len = batch["aatype"].shape[-1]
         cum_loss = cum_loss * torch.sqrt(min(seq_len, crop_len))
+        # Scale the loss by the square root of the minimum of the crop size and
+        # the (average) sequence length. See subsection 1.9.
+        seq_len = torch.mean(batch["seq_length"].float())
+        crop_len = batch["aatype"].shape[-1]
+        cum_loss = cum_loss * torch.sqrt(min(seq_len, crop_len))
         return cum_loss
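The added lines apply the loss scaling described in subsection 1.9 of the AlphaFold supplement: the accumulated loss is multiplied by the square root of the smaller of the crop size and the batch-averaged sequence length. A self-contained illustration with made-up numbers follows; it uses `torch.minimum` so both operands stay tensors, whereas the code above relies on Python's built-in `min`:

```python
import torch

# Hypothetical batch: three sequences of true length 384, 212 and 97 residues,
# each cropped to 256 positions.
batch = {
    "seq_length": torch.tensor([384., 212., 97.]),
    "aatype": torch.zeros(3, 256, dtype=torch.long),
}
cum_loss = torch.tensor(7.5)

seq_len = torch.mean(batch["seq_length"].float())              # 231.0
crop_len = batch["aatype"].shape[-1]                           # 256
scale = torch.sqrt(torch.minimum(seq_len, torch.tensor(float(crop_len))))
cum_loss = cum_loss * scale                                    # 7.5 * sqrt(231.0)
print(scale.item(), cum_loss.item())
```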
biopython==1.79
deepspeed==0.5.3
dm-tree==0.1.6
ml-collections==0.1.0
numpy==1.21.2
PyYAML==5.4.1
requests==2.26.0
scipy==1.7.1
torch==1.10.0
tqdm==4.62.2
typing-extensions==3.10.0.2
pytorch_lightning==1.5.0
git+git://github.com/NVIDIA/dllogger.git
 #!/bin/bash
+CONDA_INSTALL_URL=${CONDA_INSTALL_URL:-"https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"}
 source scripts/vars.sh
 # Install Miniconda locally
 rm -rf lib/conda
 rm -f /tmp/Miniconda3-latest-Linux-x86_64.sh
-wget -q -P /tmp \
-    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+wget -P /tmp \
+    "${CONDA_INSTALL_URL}" \
     && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p lib/conda \
     && rm /tmp/Miniconda3-latest-Linux-x86_64.sh
 # Grab conda-only packages
-PATH=lib/conda/bin:$PATH
-conda update -qy conda \
-    && conda create --name $ENV_NAME -y python==3.7 \
-    && source lib/conda/etc/profile.d/conda.sh \
-    && conda activate $ENV_NAME \
-    && pip install -r requirements.txt \
-    && conda install -qy -c conda-forge \
-    openmm=7.5.1 \
-    pdbfixer
-# Comment out if you have these already installed on your system, for example in /usr/bin/
-conda install -c bioconda aria2
-conda install -y -c bioconda hmmer==3.3.2 hhsuite==3.3.0 kalign2==2.04
-pip install nvidia-pyindex
-pip install nvidia-dllogger
+export PATH=lib/conda/bin:$PATH
+conda env create --name=${ENV_NAME} -f environment.yml
+source activate ${ENV_NAME}
 # Install DeepMind's OpenMM patch
 OPENFOLD_DIR=$PWD
...
@@ -97,7 +97,7 @@ class OpenFoldWrapper(pl.LightningModule):
     def configure_optimizers(self,
         learning_rate: float = 1e-3,
-        eps: float = 1e-5
+        eps: float = 1e-5,
     ) -> torch.optim.Adam:
         # Ignored as long as a DeepSpeed optimizer is configured
         return torch.optim.Adam(
@@ -293,6 +293,11 @@ if __name__ == "__main__":
         "--distillation_mapping_path", type=str, default=None,
         help="""See --train_mapping_path"""
     )
+    parser.add_argument(
+        "--obsolete_pdbs_file_path", type=str, default=None,
+        help="""Path to obsolete.dat file containing list of obsolete PDBs and
+             their replacements."""
+    )
     parser.add_argument(
         "--template_release_dates_cache_path", type=str, default=None,
         help="""Output of scripts/generate_mmcif_cache.py run on template mmCIF
...
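The new `--obsolete_pdbs_file_path` argument points at a PDB `obsolete.dat` listing. A hedged sketch of turning such a file into an old-ID-to-replacement mapping, assuming the standard PDB format in which each `OBSLTE` record names the superseded entry followed by its replacement (this parser is illustrative; OpenFold's actual handling may differ):

```python
def parse_obsolete_pdbs(path: str) -> dict:
    """Map obsolete PDB IDs to their replacements, e.g. {"116l": "216l"}.

    Assumes lines of the form:
        OBSLTE    31-JUL-94 116L     216L
    """
    mapping = {}
    with open(path) as f:
        for line in f:
            fields = line.split()
            if len(fields) >= 4 and fields[0] == "OBSLTE":
                old_id, new_id = fields[2].lower(), fields[3].lower()
                mapping[old_id] = new_id
    return mapping
```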