hhsearch.py 2.56 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
"""Library to run HHsearch from Python."""

import glob
import os
import subprocess
from typing import Sequence

from absl import logging


11
from openfold.features.np import utils
12
13
14
15
16
17
18
19
20


class HHSearch:
  """Python wrapper of the HHsearch binary."""

  def __init__(self,
               *,
               binary_path: str,
               databases: Sequence[str],
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
21
               n_cpu: int = 2,
22
23
24
25
26
27
28
29
               maxseq: int = 1_000_000):
    """Initializes the Python HHsearch wrapper.

    Args:
      binary_path: The path to the HHsearch executable.
      databases: A sequence of HHsearch database paths. This should be the
        common prefix for the database files (i.e. up to but not including
        _hhm.ffindex etc.)
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
30
      n_cpu: The number of CPUs to use
31
32
33
34
35
36
37
38
      maxseq: The maximum number of rows in an input alignment. Note that this
        parameter is only supported in HHBlits version 3.1 and higher.

    Raises:
      RuntimeError: If HHsearch binary not found within the path.
    """
    self.binary_path = binary_path
    self.databases = databases
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
39
    self.n_cpu = n_cpu
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
    self.maxseq = maxseq

    for database_path in self.databases:
      if not glob.glob(database_path + '_*'):
        logging.error('Could not find HHsearch database %s', database_path)
        raise ValueError(f'Could not find HHsearch database {database_path}')

  def query(self, a3m: str) -> str:
    """Queries the database using HHsearch using a given a3m."""
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      input_path = os.path.join(query_tmp_dir, 'query.a3m')
      hhr_path = os.path.join(query_tmp_dir, 'output.hhr')
      with open(input_path, 'w') as f:
        f.write(a3m)

      db_cmd = []
      for db_path in self.databases:
        db_cmd.append('-d')
        db_cmd.append(db_path)
      cmd = [self.binary_path,
             '-i', input_path,
             '-o', hhr_path,
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
62
63
             '-maxseq', str(self.maxseq),
             '-cpu', str(self.n_cpu),
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
             ] + db_cmd

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      with utils.timing('HHsearch query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Stderr is truncated to prevent proto size errors in Beam.
        raise RuntimeError(
            'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                stdout.decode('utf-8'), stderr[:100_000].decode('utf-8')))

      with open(hhr_path) as f:
        hhr = f.read()
    return hhr