precompute_alignments.py 4.61 KB
Newer Older
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
1
2
3
4
5
import argparse
import logging
import os
import tempfile

sft-managed's avatar
sft-managed committed
6
7
import openfold.data.mmcif_parsing as mmcif_parsing
from openfold.data.data_pipeline import AlignmentRunner
8
from openfold.data.parsers import parse_fasta
Gustaf's avatar
Gustaf committed
9
from openfold.np import protein, residue_constants
sft-managed's avatar
sft-managed committed
10
11
12
13
14
15

from utils import add_data_args

#python3 scripts/precompute_alignments.py mmcif_dir/ alignment_dir/     data/uniref90/uniref90.fasta     data/mgnify/mgy_clusters_2018_12.fa     data/pdb70/pdb70     data/pdb_mmcif/mmcif_files/     data/uniclust30/uniclust30_2018_08/uniclust30_2018_08     --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt     --cpus 16  --jackhmmer_binary_path /home/u00u98too4mkqFBu8M357/openfold/lib/conda/envs/openfold_venv/bin/jackhmmer  --hhblits_binary_path /home/u00u98too4mkqFBu8M357/openfold/lib/conda/envs/openfold_venv/bin/hhblits  --hhsearch_binary_path /home/u00u98too4mkqFBu8M357/openfold/lib/conda/envs/openfold_venv/bin/hhsearch  --kalign_binary_path /home/u00u98too4mkqFBu8M357/openfold/lib/conda/envs/openfold_venv/bin/kalign

logging.basicConfig(level=logging.DEBUG)
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36


def main(args):
    # Build the alignment tool runner
    alignment_runner = AlignmentRunner(
        jackhmmer_binary_path=args.jackhmmer_binary_path,
        hhblits_binary_path=args.hhblits_binary_path,
        hhsearch_binary_path=args.hhsearch_binary_path,
        uniref90_database_path=args.uniref90_database_path,
        mgnify_database_path=args.mgnify_database_path,
        bfd_database_path=args.bfd_database_path,
        uniclust30_database_path=args.uniclust30_database_path,
        pdb70_database_path=args.pdb70_database_path,
        use_small_bfd=args.bfd_database_path is None,
        no_cpus=args.cpus,
    )

    for f in os.listdir(args.input_dir):
        path = os.path.join(args.input_dir, f)
        file_id = os.path.splitext(f)[0]
        seqs = {}
Gustaf's avatar
Gustaf committed
37
        if(f.endswith('.cif')):
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
            with open(path, 'r') as fp:
                mmcif_str = fp.read()
            mmcif = mmcif_parsing.parse(
                file_id=file_id, mmcif_string=mmcif_str
            )
            if(mmcif.mmcif_object is None):
                logging.warning(f'Failed to parse {f}...')
                if(args.raise_errors):
                    raise list(mmcif.errors.values())[0]
                else:
                    continue
            mmcif = mmcif.mmcif_object
            for k,v in mmcif.chain_to_seqres.items():
                chain_id = '_'.join([file_id, k])
                seqs[chain_id] = v
53
        elif(f.endswith('.fasta') or f.endswith('.fa')):
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
54
55
            with open(path, 'r') as fp:
                fasta_str = fp.read()
56
            input_seqs, _ = parse_fasta(fasta_str)
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
57
58
59
60
61
62
63
64
            if len(input_seqs) != 1: 
                msg = f'More than one input_sequence found in {f}'
                if(args.raise_errors):
                    raise ValueError(msg)
                else:
                    logging.warning(msg)
            input_sequence = input_seqs[0]
            seqs[file_id] = input_sequence
Gustaf's avatar
Gustaf committed
65
66
67
68
        elif(f.endswith('.core')):
            with open(path, 'r') as fp:
                core_str = fp.read()
            core_prot = protein.from_proteinnet_string(core_str)
Gustaf's avatar
Gustaf committed
69
            aatype = core_prot.aatype
Gustaf's avatar
Gustaf committed
70
71
72
73
74
            seq = ''.join([
                residue_constants.restypes_with_x[aatype[i]] 
                for i in range(len(aatype))
            ])
            seqs[file_id] = seq
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
75
76
77
78
79
80
81
82
83
84
85
        else:
            continue

        for name, seq in seqs.items():
            alignment_dir = os.path.join(args.output_dir, name)
            if(os.path.isdir(alignment_dir)):
                logging.info(f'{f} has already been processed. Skipping...')
                continue

            os.makedirs(alignment_dir)

Gustaf's avatar
Gustaf committed
86
87
88
            fd, fasta_path = tempfile.mkstemp(suffix=".fasta")
            with os.fdopen(fd, 'w') as fp:
                fp.write(f'>query\n{seq}')
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
89
90

            alignment_runner.run(
Gustaf's avatar
Gustaf committed
91
                fasta_path, alignment_dir
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
92
93
            )

Gustaf's avatar
Gustaf committed
94
            os.remove(fasta_path)
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
95
96
97
98
99
100


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_dir", type=str,
101
102
        help="""Path to directory containing mmCIF, FASTA and/or ProteinNet
                .core files"""
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
    )
    parser.add_argument(
        "output_dir", type=str,
        help="Directory in which to output alignments"
    )
    add_data_args(parser)
    parser.add_argument(
        "--raise_errors", type=bool, default=False,
        help="Whether to crash on parsing errors"
    )
    parser.add_argument(
        "--cpus", type=int, default=4,
        help="Number of CPUs to use"
    )

    args = parser.parse_args()

    main(args)