Commit 42427a1c authored by Shenggan

add inference pipeline from openfold/alphafold

parent b3b3b445
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Library to run Jackhmmer from Python."""
from concurrent import futures
import glob
import logging
import os
import subprocess
from typing import Any, Callable, Mapping, Optional, Sequence
from urllib import request
from fastfold.data.tools import utils
class Jackhmmer:
"""Python wrapper of the Jackhmmer binary."""
def __init__(
self,
*,
binary_path: str,
database_path: str,
n_cpu: int = 8,
n_iter: int = 1,
e_value: float = 0.0001,
z_value: Optional[int] = None,
get_tblout: bool = False,
filter_f1: float = 0.0005,
filter_f2: float = 0.00005,
filter_f3: float = 0.0000005,
incdom_e: Optional[float] = None,
dom_e: Optional[float] = None,
num_streamed_chunks: Optional[int] = None,
streaming_callback: Optional[Callable[[int], None]] = None,
):
"""Initializes the Python Jackhmmer wrapper.
Args:
binary_path: The path to the jackhmmer executable.
database_path: The path to the jackhmmer database (FASTA format).
n_cpu: The number of CPUs to give Jackhmmer.
n_iter: The number of Jackhmmer iterations.
e_value: The E-value, see Jackhmmer docs for more details.
z_value: The Z-value, see Jackhmmer docs for more details.
get_tblout: Whether to save tblout string.
filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off.
filter_f2: Viterbi pre-filter, set to >1.0 to turn off.
filter_f3: Forward pre-filter, set to >1.0 to turn off.
incdom_e: Domain e-value criteria for inclusion of domains in MSA/next
round.
dom_e: Domain e-value criteria for inclusion in tblout.
num_streamed_chunks: Number of database chunks to stream over.
streaming_callback: Callback function run after each chunk iteration with
the iteration number as argument.
"""
self.binary_path = binary_path
self.database_path = database_path
self.num_streamed_chunks = num_streamed_chunks
if (
not os.path.exists(self.database_path)
and num_streamed_chunks is None
):
logging.error("Could not find Jackhmmer database %s", database_path)
raise ValueError(
f"Could not find Jackhmmer database {database_path}"
)
self.n_cpu = n_cpu
self.n_iter = n_iter
self.e_value = e_value
self.z_value = z_value
self.filter_f1 = filter_f1
self.filter_f2 = filter_f2
self.filter_f3 = filter_f3
self.incdom_e = incdom_e
self.dom_e = dom_e
self.get_tblout = get_tblout
self.streaming_callback = streaming_callback
def _query_chunk(
self, input_fasta_path: str, database_path: str
) -> Mapping[str, Any]:
"""Queries the database chunk using Jackhmmer."""
with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
sto_path = os.path.join(query_tmp_dir, "output.sto")
# The F1/F2/F3 are the expected proportion to pass each of the filtering
# stages (which get progressively more expensive), reducing these
# speeds up the pipeline at the expense of sensitivity. They are
# currently set very low to make querying Mgnify run in a reasonable
# amount of time.
cmd_flags = [
# Don't pollute stdout with Jackhmmer output.
"-o",
"/dev/null",
"-A",
sto_path,
"--noali",
"--F1",
str(self.filter_f1),
"--F2",
str(self.filter_f2),
"--F3",
str(self.filter_f3),
"--incE",
str(self.e_value),
# Report only sequences with E-values <= x in per-sequence output.
"-E",
str(self.e_value),
"--cpu",
str(self.n_cpu),
"-N",
str(self.n_iter),
]
if self.get_tblout:
tblout_path = os.path.join(query_tmp_dir, "tblout.txt")
cmd_flags.extend(["--tblout", tblout_path])
if self.z_value:
cmd_flags.extend(["-Z", str(self.z_value)])
if self.dom_e is not None:
cmd_flags.extend(["--domE", str(self.dom_e)])
if self.incdom_e is not None:
cmd_flags.extend(["--incdomE", str(self.incdom_e)])
cmd = (
[self.binary_path]
+ cmd_flags
+ [input_fasta_path, database_path]
)
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
with utils.timing(
f"Jackhmmer ({os.path.basename(database_path)}) query"
):
_, stderr = process.communicate()
retcode = process.wait()
if retcode:
raise RuntimeError(
"Jackhmmer failed\nstderr:\n%s\n" % stderr.decode("utf-8")
)
# Get e-values for each target name
tbl = ""
if self.get_tblout:
with open(tblout_path) as f:
tbl = f.read()
with open(sto_path) as f:
sto = f.read()
raw_output = dict(
sto=sto,
tbl=tbl,
stderr=stderr,
n_iter=self.n_iter,
e_value=self.e_value,
)
return raw_output
def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]:
"""Queries the database using Jackhmmer."""
if self.num_streamed_chunks is None:
return [self._query_chunk(input_fasta_path, self.database_path)]
db_basename = os.path.basename(self.database_path)
db_remote_chunk = lambda db_idx: f"{self.database_path}.{db_idx}"
db_local_chunk = lambda db_idx: f"/tmp/ramdisk/{db_basename}.{db_idx}"
# Remove existing files to prevent OOM
for f in glob.glob(db_local_chunk("[0-9]*")):
try:
os.remove(f)
except OSError:
print(f"OSError while deleting {f}")
# Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk
with futures.ThreadPoolExecutor(max_workers=2) as executor:
chunked_output = []
for i in range(1, self.num_streamed_chunks + 1):
# Copy the chunk locally
if i == 1:
future = executor.submit(
request.urlretrieve,
db_remote_chunk(i),
db_local_chunk(i),
)
if i < self.num_streamed_chunks:
next_future = executor.submit(
request.urlretrieve,
db_remote_chunk(i + 1),
db_local_chunk(i + 1),
)
# Run Jackhmmer with the chunk
future.result()
chunked_output.append(
self._query_chunk(input_fasta_path, db_local_chunk(i))
)
# Remove the local copy of the chunk
os.remove(db_local_chunk(i))
future = next_future
if self.streaming_callback:
self.streaming_callback(i)
return chunked_output
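# A minimal usage sketch (illustrative only; the binary and database paths
# below are hypothetical placeholders):
#
#   runner = Jackhmmer(
#       binary_path="/usr/bin/jackhmmer",
#       database_path="/data/uniref90/uniref90.fasta",
#   )
#   results = runner.query("/data/query.fasta")
#   msa_sto = results[0]["sto"]  # Stockholm-format MSA for the single chunk
#
# For a remotely hosted database pre-split into chunks named
# "<database_path>.1" ... "<database_path>.N", pass num_streamed_chunks=N;
# each chunk is then downloaded to /tmp/ramdisk, queried, and deleted in turn.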
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A Python wrapper for Kalign."""
import os
import subprocess
from typing import Sequence
from absl import logging
from fastfold.data.tools import utils
def _to_a3m(sequences: Sequence[str]) -> str:
"""Converts sequences to an a3m file."""
names = ["sequence %d" % i for i in range(1, len(sequences) + 1)]
a3m = []
for sequence, name in zip(sequences, names):
a3m.append(u">" + name + u"\n")
a3m.append(sequence + u"\n")
return "".join(a3m)
class Kalign:
"""Python wrapper of the Kalign binary."""
def __init__(self, *, binary_path: str):
"""Initializes the Python Kalign wrapper.
Args:
binary_path: The path to the Kalign binary.
Raises:
RuntimeError: If Kalign binary not found within the path.
"""
self.binary_path = binary_path
def align(self, sequences: Sequence[str]) -> str:
"""Aligns the sequences and returns the alignment in A3M string.
Args:
sequences: A list of query sequence strings. The sequences have to be at
least 6 residues long (Kalign requires this). Note that the order in
which you give the sequences might alter the output slightly, as a
different alignment tree might get constructed.
Returns:
A string with the alignment in a3m format.
Raises:
RuntimeError: If Kalign fails.
ValueError: If any of the sequences is less than 6 residues long.
"""
logging.info("Aligning %d sequences", len(sequences))
for s in sequences:
if len(s) < 6:
raise ValueError(
"Kalign requires all sequences to be at least 6 "
"residues long. Got %s (%d residues)." % (s, len(s))
)
with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
input_fasta_path = os.path.join(query_tmp_dir, "input.fasta")
output_a3m_path = os.path.join(query_tmp_dir, "output.a3m")
with open(input_fasta_path, "w") as f:
f.write(_to_a3m(sequences))
cmd = [
self.binary_path,
"-i",
input_fasta_path,
"-o",
output_a3m_path,
"-format",
"fasta",
]
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
with utils.timing("Kalign query"):
stdout, stderr = process.communicate()
retcode = process.wait()
logging.info(
"Kalign stdout:\n%s\n\nstderr:\n%s\n",
stdout.decode("utf-8"),
stderr.decode("utf-8"),
)
if retcode:
raise RuntimeError(
"Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n"
% (stdout.decode("utf-8"), stderr.decode("utf-8"))
)
with open(output_a3m_path) as f:
a3m = f.read()
return a3m
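# A minimal usage sketch (illustrative only; the binary path is a hypothetical
# placeholder). Every input sequence must be at least 6 residues long:
#
#   aligner = Kalign(binary_path="/usr/bin/kalign")
#   a3m = aligner.align(["MKFLVLLFNILCLFP", "MKFLVLLFNILCLLP"])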
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common utilities for data pipeline tools."""
import contextlib
import datetime
import logging
import shutil
import tempfile
import time
from typing import Optional
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
"""Context manager that deletes a temporary directory on exit."""
tmpdir = tempfile.mkdtemp(dir=base_dir)
try:
yield tmpdir
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
@contextlib.contextmanager
def timing(msg: str):
    """Context manager that logs how long the enclosed block takes."""
logging.info("Started %s", msg)
tic = time.perf_counter()
yield
toc = time.perf_counter()
logging.info("Finished %s in %.3f seconds", msg, toc - tic)
def to_date(s: str):
    """Parses a 'YYYY-MM-DD' date string into a datetime object."""
return datetime.datetime(
year=int(s[:4]), month=int(s[5:7]), day=int(s[8:10])
)
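# Illustrative usage of these helpers:
#
#   with tmpdir_manager(base_dir="/tmp") as tmp_dir:
#       ...  # everything written under tmp_dir is removed on exit
#
#   with timing("MSA construction"):
#       ...  # logs "Started ..." and "Finished ... in N.NNN seconds"
#
#   to_date("2021-09-30")  # -> datetime.datetime(2021, 9, 30, 0, 0)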
from .msa import MSAStack
from .ops import OutProductMean
from .triangle import PairStack
from .evoformer import Evoformer
__all__ = ['MSAStack', 'OutProductMean', 'PairStack', 'Evoformer']
 import torch.nn as nn
 from fastfold.distributed.comm_async import All_to_All_Async, All_to_All_Async_Opp
-from fastfold.model import MSAStack, OutProductMean, PairStack
+from fastfold.model.fastnn import MSAStack, OutProductMean, PairStack
 class Evoformer(nn.Module):
 ...