hhsearch.py 3.52 KB
Newer Older
Augustin-Zidek's avatar
Augustin-Zidek committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Library to run HHsearch from Python."""

import glob
import os
import subprocess
from typing import Sequence

from absl import logging

24
from alphafold.data import parsers
Augustin-Zidek's avatar
Augustin-Zidek committed
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from alphafold.data.tools import utils
# Internal import (7716).


class HHSearch:
  """Python wrapper of the HHsearch binary."""

  def __init__(self,
               *,
               binary_path: str,
               databases: Sequence[str],
               maxseq: int = 1_000_000):
    """Initializes the Python HHsearch wrapper.

    Args:
      binary_path: The path to the HHsearch executable.
      databases: A sequence of HHsearch database paths. This should be the
        common prefix for the database files (i.e. up to but not including
        _hhm.ffindex etc.)
      maxseq: The maximum number of rows in an input alignment. Note that this
        parameter is only supported in HHBlits version 3.1 and higher.

    Raises:
      RuntimeError: If HHsearch binary not found within the path.
    """
    self.binary_path = binary_path
    self.databases = databases
    self.maxseq = maxseq

    for database_path in self.databases:
      if not glob.glob(database_path + '_*'):
        logging.error('Could not find HHsearch database %s', database_path)
        raise ValueError(f'Could not find HHsearch database {database_path}')

59
60
61
62
63
64
65
66
  @property
  def output_format(self) -> str:
    return 'hhr'

  @property
  def input_format(self) -> str:
    return 'a3m'

Augustin-Zidek's avatar
Augustin-Zidek committed
67
68
  def query(self, a3m: str) -> str:
    """Queries the database using HHsearch using a given a3m."""
69
    with utils.tmpdir_manager() as query_tmp_dir:
Augustin-Zidek's avatar
Augustin-Zidek committed
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
      input_path = os.path.join(query_tmp_dir, 'query.a3m')
      hhr_path = os.path.join(query_tmp_dir, 'output.hhr')
      with open(input_path, 'w') as f:
        f.write(a3m)

      db_cmd = []
      for db_path in self.databases:
        db_cmd.append('-d')
        db_cmd.append(db_path)
      cmd = [self.binary_path,
             '-i', input_path,
             '-o', hhr_path,
             '-maxseq', str(self.maxseq)
             ] + db_cmd

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      with utils.timing('HHsearch query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Stderr is truncated to prevent proto size errors in Beam.
        raise RuntimeError(
            'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                stdout.decode('utf-8'), stderr[:100_000].decode('utf-8')))

      with open(hhr_path) as f:
        hhr = f.read()
    return hhr
101
102
103
104
105
106
107

  def get_template_hits(self,
                        output_string: str,
                        input_sequence: str) -> Sequence[parsers.TemplateHit]:
    """Gets parsed template hits from the raw string output by the tool."""
    del input_sequence  # Used by hmmseach but not needed for hhsearch.
    return parsers.parse_hhr(output_string)